diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 586eb2f3cf45e..bc20e1be83817 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27641,7 +27641,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   // If inserting an UNDEF, just return the original vector.
   if (N1.isUndef())
-    return N0;
+    return DAG.getFreeze(N0);
 
   // If this is an insert of an extracted vector into an undef vector, we can
   // just use the input to the extract if the types match, and can simplify
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 6df21b624137f..f8001028c670b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7962,7 +7962,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // If the inserted element is an UNDEF, just use the input vector.
     if (N2.isUndef())
-      return N1;
+      return getFreeze(N1);
     break;
   }
@@ -8001,7 +8001,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     // can just use the input to the extract.
     if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
         N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
-      return N2.getOperand(0);
+      return getFreeze(N2.getOperand(0));
     break;
   }
   case ISD::BITCAST:
@@ -8028,7 +8028,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
            "Vector and mask must have same number of elements.");
     if (N1.isUndef() || N2.isUndef())
-      return N3;
+      return getFreeze(N3);
     break;
   }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ced29f8fb3d0c..63f06e3f58b47 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45047,6 +45047,7 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::VPERMILPI:
+  case X86ISD::VPERMV:
   case X86ISD::VPERMV3: {
     SmallVector<int, 8> Mask;
     SmallVector<SDValue, 2> Ops;
@@ -45094,10 +45095,13 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
   case X86ISD::BLENDV:
     return false;
   // SSE target shuffles.
+  case X86ISD::PACKSS:
+  case X86ISD::PACKUS:
   case X86ISD::PSHUFD:
   case X86ISD::UNPCKL:
   case X86ISD::UNPCKH:
   case X86ISD::VPERMILPI:
+  case X86ISD::VPERMV:
   case X86ISD::VPERMV3:
     return false;
   // SSE comparisons handle all icmp/fcmp cases.
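Aside (not part of the patch): the folds above drop an inserted UNDEF element or subvector, and undef is strictly weaker than poison, so the pre-fold node is guaranteed not to be poison in those lanes even when the source vector is. Returning the source operand unchanged could therefore introduce poison where the original node had none, which is why each fold now returns a frozen copy instead. A minimal, hypothetical IR sketch of the same hazard (illustrative only, not taken from the patch's tests; the function name and types are made up) shown at IR level for brevity:

define <4 x i32> @insert_undef_lane(<4 x i32> %x) {
  ; If lane 0 of %x is poison, %y still has undef (not poison) in lane 0,
  ; so rewriting %y to plain %x would turn undef into poison; rewriting it
  ; to freeze(%x) keeps every lane non-poison and remains a valid refinement.
  %y = insertelement <4 x i32> %x, i32 undef, i64 0
  ret <4 x i32> %y
}

The test updates below are the fallout of the extra freeze nodes (and of the newly whitelisted X86 shuffle opcodes) on generated code.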
diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll index 888aa9d7f9cdc..0366e630aea44 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll @@ -132,7 +132,7 @@ define @dup_extract_nxv2i64_v2i64(<2 x i64> %data) { define @dup_extract_nxv2i64_v1i64(<1 x i64> %data) { ; CHECK-LABEL: dup_extract_nxv2i64_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: ret %1 = extractelement <1 x i64> %data, i64 1 %.splatinsert = insertelement poison, i64 %1, i32 0 diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index cc05836d3d156..8cfa9bfceae88 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -304,9 +304,12 @@ define <16 x i1> @lane_mask_v16i1_i8(i8 %index, i8 %TC) { ; ; CHECK-STREAMING-LABEL: lane_mask_v16i1_i8: ; CHECK-STREAMING: // %bb.0: -; CHECK-STREAMING-NEXT: index z0.b, w0, #1 -; CHECK-STREAMING-NEXT: mov z1.b, w0 +; CHECK-STREAMING-NEXT: index z0.b, #0, #1 ; CHECK-STREAMING-NEXT: ptrue p0.b, vl16 +; CHECK-STREAMING-NEXT: mov z1.b, w0 +; CHECK-STREAMING-NEXT: mov z0.b, p0/m, z0.b +; CHECK-STREAMING-NEXT: sel z1.b, p0, z1.b, z0.b +; CHECK-STREAMING-NEXT: add z0.b, z1.b, z0.b ; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z1.b, z0.b ; CHECK-STREAMING-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-STREAMING-NEXT: orr z0.d, z0.d, z1.d @@ -331,9 +334,12 @@ define <8 x i1> @lane_mask_v8i1_i8(i8 %index, i8 %TC) { ; ; CHECK-STREAMING-LABEL: lane_mask_v8i1_i8: ; CHECK-STREAMING: // %bb.0: -; CHECK-STREAMING-NEXT: index z0.b, w0, #1 -; CHECK-STREAMING-NEXT: mov z1.b, w0 +; CHECK-STREAMING-NEXT: index z0.b, #0, #1 ; CHECK-STREAMING-NEXT: ptrue p0.b, vl8 +; CHECK-STREAMING-NEXT: mov z1.b, w0 +; CHECK-STREAMING-NEXT: mov z0.b, p0/m, z0.b +; CHECK-STREAMING-NEXT: sel z1.b, p0, z1.b, z0.b +; CHECK-STREAMING-NEXT: add z0.b, z1.b, z0.b ; CHECK-STREAMING-NEXT: cmphi p1.b, p0/z, z1.b, z0.b ; CHECK-STREAMING-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff ; CHECK-STREAMING-NEXT: orr z0.d, z0.d, z1.d @@ -362,15 +368,20 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { ; ; CHECK-STREAMING-LABEL: lane_mask_v4i1_i8: ; CHECK-STREAMING: // %bb.0: -; CHECK-STREAMING-NEXT: mov z1.h, w0 -; CHECK-STREAMING-NEXT: index z0.h, #0, #1 +; CHECK-STREAMING-NEXT: mov z0.h, #255 // =0xff ; CHECK-STREAMING-NEXT: ptrue p0.h, vl4 -; CHECK-STREAMING-NEXT: and z1.h, z1.h, #0xff -; CHECK-STREAMING-NEXT: add z0.h, z1.h, z0.h -; CHECK-STREAMING-NEXT: mov z1.h, w1 -; CHECK-STREAMING-NEXT: umin z0.h, z0.h, #255 -; CHECK-STREAMING-NEXT: and z1.h, z1.h, #0xff -; CHECK-STREAMING-NEXT: cmphi p0.h, p0/z, z1.h, z0.h +; CHECK-STREAMING-NEXT: index z2.h, #0, #1 +; CHECK-STREAMING-NEXT: mov z1.h, w0 +; CHECK-STREAMING-NEXT: mov z3.h, w1 +; CHECK-STREAMING-NEXT: mov z0.h, p0/m, z0.h +; CHECK-STREAMING-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-STREAMING-NEXT: sel z2.h, p0, z2.h, z0.h +; CHECK-STREAMING-NEXT: sel z3.h, p0, z3.h, z0.h +; CHECK-STREAMING-NEXT: and z1.d, z1.d, z0.d +; CHECK-STREAMING-NEXT: add z1.h, z1.h, z2.h +; CHECK-STREAMING-NEXT: and z2.d, z3.d, z0.d +; CHECK-STREAMING-NEXT: umin z0.h, p0/m, z0.h, z1.h +; CHECK-STREAMING-NEXT: cmphi p0.h, p0/z, z2.h, z0.h ; CHECK-STREAMING-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-STREAMING-NEXT: // kill: def $d0 killed $d0 killed $z0 ; 
CHECK-STREAMING-NEXT: ret @@ -394,10 +405,14 @@ define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; ; CHECK-STREAMING-LABEL: lane_mask_v2i1_i8: ; CHECK-STREAMING: // %bb.0: -; CHECK-STREAMING-NEXT: and w8, w0, #0xff +; CHECK-STREAMING-NEXT: index z0.s, #0, #1 ; CHECK-STREAMING-NEXT: ptrue p0.s, vl2 -; CHECK-STREAMING-NEXT: index z0.s, w8, #1 +; CHECK-STREAMING-NEXT: and w8, w0, #0xff +; CHECK-STREAMING-NEXT: mov z1.s, w8 ; CHECK-STREAMING-NEXT: and w8, w1, #0xff +; CHECK-STREAMING-NEXT: mov z0.s, p0/m, z0.s +; CHECK-STREAMING-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-STREAMING-NEXT: add z0.s, z1.s, z0.s ; CHECK-STREAMING-NEXT: mov z1.s, w8 ; CHECK-STREAMING-NEXT: umin z0.s, z0.s, #255 ; CHECK-STREAMING-NEXT: cmphi p0.s, p0/z, z1.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll index 82802c79c7085..c6fff3e3d3181 100644 --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -57,8 +57,8 @@ define void @widen_f16_build_vector(ptr %addr) { ; CHECK-LABEL: widen_f16_build_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #13294 // =0x33ee -; CHECK-NEXT: movk w8, #13294, lsl #16 -; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: dup v0.4h, w8 +; CHECK-NEXT: str s0, [x0] ; CHECK-NEXT: ret store <2 x half> , ptr %addr, align 2 ret void diff --git a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll index 34899cb47dba3..545da98034527 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector-add-combine.ll @@ -94,16 +94,14 @@ define i32 @combine_undef_add_8xi32(i32 %a, i32 %b, i32 %c, i32 %d) local_unname ; CHECK-LABEL: combine_undef_add_8xi32: ; CHECK: // %bb.0: ; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: mov v1.s[1], w1 -; CHECK-NEXT: uhadd v0.4h, v0.4h, v0.4h ; CHECK-NEXT: mov v1.s[2], w2 ; CHECK-NEXT: mov v1.s[3], w3 -; CHECK-NEXT: xtn v2.4h, v1.4s -; CHECK-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-NEXT: uhadd v1.4h, v2.4h, v1.4h -; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: uaddlv s0, v1.8h +; CHECK-NEXT: uzp2 v2.8h, v1.8h, v0.8h +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uhadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %a1 = insertelement <8 x i32> poison, i32 %a, i32 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll b/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll index bf706f3122e3a..d751fa1a1e190 100644 --- a/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll +++ b/llvm/test/CodeGen/AArch64/sve-fix-length-and-combine-512.ll @@ -9,6 +9,8 @@ define void @vls_sve_and_64xi8(ptr %ap, ptr %out) nounwind { ; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x8] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll index fb494afa11de2..9ddf24f3c2f9e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-bitselect.ll @@ -13,15 +13,22 @@ define void @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ ; 
CHECK-LABEL: fixed_bitselect_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl8 -; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x2] -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: subr z0.s, z0.s, #0 // =0x0 +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2] +; CHECK-NEXT: mov z3.s, p0/m, #0 // =0x0 +; CHECK-NEXT: mov z4.s, p0/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sub z3.s, z3.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s +; CHECK-NEXT: add z0.s, z0.s, z4.s +; CHECK-NEXT: and z1.d, z3.d, z1.d ; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: orr z0.d, z1.d, z0.d +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x3] ; CHECK-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll index 37450431d8a11..63903034dddb4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll @@ -47,8 +47,10 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: and z1.h, z1.h, #0x8000 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: and z1.h, z1.h, #0x8000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -68,10 +70,14 @@ define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: and z1.h, z1.h, #0x8000 +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x7fff -; VBITS_GE_256-NEXT: and z2.h, z2.h, #0x7fff +; VBITS_GE_256-NEXT: and z1.h, z1.h, #0x8000 ; VBITS_GE_256-NEXT: and z3.h, z3.h, #0x8000 +; VBITS_GE_256-NEXT: and z2.h, z2.h, #0x7fff ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -83,8 +89,10 @@ define void @test_copysign_v32f16_v32f16(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: and z1.h, z1.h, #0x8000 +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x7fff +; VBITS_GE_512-NEXT: and z1.h, z1.h, #0x8000 ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -101,8 +109,10 @@ define void @test_copysign_v64f16_v64f16(ptr %ap, ptr %bp) vscale_range(8,0) #0 ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: and z1.h, z1.h, #0x8000 +; CHECK-NEXT: mov 
z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: and z1.h, z1.h, #0x8000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -119,8 +129,10 @@ define void @test_copysign_v128f16_v128f16(ptr %ap, ptr %bp) vscale_range(16,0) ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: and z1.h, z1.h, #0x8000 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: and z1.h, z1.h, #0x8000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -171,8 +183,10 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -192,10 +206,14 @@ define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: and z1.s, z1.s, #0x80000000 +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: and z0.s, z0.s, #0x7fffffff -; VBITS_GE_256-NEXT: and z2.s, z2.s, #0x7fffffff +; VBITS_GE_256-NEXT: and z1.s, z1.s, #0x80000000 ; VBITS_GE_256-NEXT: and z3.s, z3.s, #0x80000000 +; VBITS_GE_256-NEXT: and z2.s, z2.s, #0x7fffffff ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -207,8 +225,10 @@ define void @test_copysign_v16f32_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: and z1.s, z1.s, #0x80000000 +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: and z0.s, z0.s, #0x7fffffff +; VBITS_GE_512-NEXT: and z1.s, z1.s, #0x80000000 ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -225,8 +245,10 @@ define void @test_copysign_v32f32_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -243,8 +265,10 @@ define void @test_copysign_v64f32_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: and z1.s, z1.s, #0x80000000 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NEXT: 
and z1.s, z1.s, #0x80000000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -280,8 +304,10 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -301,10 +327,14 @@ define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: and z1.d, z1.d, #0x8000000000000000 +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: and z0.d, z0.d, #0x7fffffffffffffff -; VBITS_GE_256-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; VBITS_GE_256-NEXT: and z1.d, z1.d, #0x8000000000000000 ; VBITS_GE_256-NEXT: and z3.d, z3.d, #0x8000000000000000 +; VBITS_GE_256-NEXT: and z2.d, z2.d, #0x7fffffffffffffff ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -316,8 +346,10 @@ define void @test_copysign_v8f64_v8f64(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: and z1.d, z1.d, #0x8000000000000000 +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; VBITS_GE_512-NEXT: and z1.d, z1.d, #0x8000000000000000 ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -334,8 +366,10 @@ define void @test_copysign_v16f64_v16f64(ptr %ap, ptr %bp) vscale_range(8,0) #0 ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -352,8 +386,10 @@ define void @test_copysign_v32f64_v32f64(ptr %ap, ptr %bp) vscale_range(16,0) #0 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -439,8 +475,9 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK-NEXT: fcvt z0.d, p0/m, z0.s -; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, 
#0x8000000000000000 +; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll index d1e9dc13f50e8..59f989673f291 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -35,14 +35,16 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: sel z1.h, p0, z2.h, z0.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -55,32 +57,38 @@ define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32f16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov z0.h, w2 -; VBITS_GE_256-NEXT: ptrue p0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h -; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z3.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z4.h, z0.h +; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p0.h -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] +; VBITS_GE_512-NEXT: cmpne p1.h, 
p1/z, z0.h, #0 +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z1.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z2.h, z0.h +; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x half>, ptr %a %op2 = load volatile <32 x half>, ptr %b @@ -93,14 +101,16 @@ define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: sel z1.h, p0, z2.h, z0.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x half>, ptr %a %op2 = load volatile <64 x half>, ptr %b @@ -113,14 +123,16 @@ define void @select_v128f16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.h, vl128 +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: sel z1.h, p0, z2.h, z0.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x half>, ptr %a %op2 = load volatile <128 x half>, ptr %b @@ -158,15 +170,17 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ptrue p1.s, vl8 -; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -178,33 +192,39 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16f32: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p0.s +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, w8 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 -; 
VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s -; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z3.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z4.s, z0.s +; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16f32: ; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 -; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z1.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z2.s, z0.s +; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x float>, ptr %a %op2 = load volatile <16 x float>, ptr %b @@ -216,15 +236,17 @@ define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32f32: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x float>, ptr %a %op2 = load volatile <32 x float>, ptr %b @@ -236,15 +258,17 @@ define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64f32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64f32: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ptrue p1.s, vl64 -; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: 
ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x float>, ptr %a %op2 = load volatile <64 x float>, ptr %b @@ -282,16 +306,18 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ptrue p1.d, vl4 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b @@ -303,35 +329,41 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8f64: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, x8 -; VBITS_GE_256-NEXT: ptrue p1.d, vl4 -; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d -; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d -; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z3.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z4.d, z0.d +; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 
-; VBITS_GE_512-NEXT: ptrue p1.d, vl8 -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z1.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z2.d, z0.d +; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x double>, ptr %a %op2 = load volatile <8 x double>, ptr %b @@ -343,16 +375,18 @@ define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16f64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x double>, ptr %a %op2 = load volatile <16 x double>, ptr %b @@ -364,16 +398,18 @@ define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32f64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x double>, ptr %a %op2 = load volatile <32 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index af54b146c5b66..0a3dacd527edb 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -374,7 +374,9 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzu z0.s, p0/m, z0.s -; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptoui <16 x float> 
%op1 to <16 x i16> @@ -388,7 +390,9 @@ define void @fcvtzu_v32f32_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %res = fptoui <32 x float> %op1 to <32 x i16> @@ -402,7 +406,9 @@ define void @fcvtzu_v64f32_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %res = fptoui <64 x float> %op1 to <64 x i16> @@ -684,7 +690,10 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptoui <16 x double> %op1 to <16 x i16> @@ -698,7 +707,10 @@ define void @fcvtzu_v32f64_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %res = fptoui <32 x double> %op1 to <32 x i16> @@ -769,7 +781,9 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzu z0.d, p0/m, z0.d -; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i32> @@ -783,7 +797,9 @@ define void @fcvtzu_v16f64_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptoui <16 x double> %op1 to <16 x i32> @@ -797,7 +813,9 @@ define void @fcvtzu_v32f64_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d -; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %res = fptoui <32 x double> %op1 to <32 x i32> @@ -1267,7 +1285,9 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzs z0.s, p0/m, z0.s -; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 
+; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptosi <16 x float> %op1 to <16 x i16> @@ -1281,7 +1301,9 @@ define void @fcvtzs_v32f32_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %res = fptosi <32 x float> %op1 to <32 x i16> @@ -1295,7 +1317,9 @@ define void @fcvtzs_v64f32_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %res = fptosi <64 x float> %op1 to <64 x i16> @@ -1577,7 +1601,10 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptosi <16 x double> %op1 to <16 x i16> @@ -1591,7 +1618,10 @@ define void @fcvtzs_v32f64_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %res = fptosi <32 x double> %op1 to <32 x i16> @@ -1662,7 +1692,9 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcvtzs z0.d, p0/m, z0.d -; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i32> @@ -1676,7 +1708,9 @@ define void @fcvtzs_v16f64_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptosi <16 x double> %op1 to <16 x i32> @@ -1690,7 +1724,9 @@ define void @fcvtzs_v32f64_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d -; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %res = fptosi <32 x double> %op1 to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll index 9efe0b33910c8..aa338b12082e8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-vselect.ll @@ -36,7 +36,13 @@ define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -57,10 +63,21 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h +; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1 +; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0 +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0 +; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -70,7 +87,13 @@ define void @select_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -88,7 +111,13 @@ define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -106,7 +135,13 @@ define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -149,7 +184,13 @@ define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -170,10 +211,21 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s +; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1 +; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0 +; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -183,7 +235,13 @@ define void @select_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -201,7 +259,13 @@ define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -219,7 +283,13 @@ define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -263,7 +333,13 @@ define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -284,10 +360,21 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d +; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1 +; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -297,7 +384,13 @@ define void @select_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -315,7 +408,13 @@ define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -333,7 +432,13 @@ define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff 
+; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index 58fca3a2cf8b6..297b197d6775b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -35,6 +35,8 @@ define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -54,6 +56,10 @@ define void @add_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z3.b, z0.b ; VBITS_GE_256-NEXT: add z0.b, z0.b, z1.b ; VBITS_GE_256-NEXT: add z1.b, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] @@ -65,6 +71,8 @@ define void @add_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_512-NEXT: add z0.b, z0.b, z1.b ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -81,6 +89,8 @@ define void @add_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -97,6 +107,8 @@ define void @add_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -133,6 +145,8 @@ define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -152,6 +166,10 @@ define void @add_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: add z0.h, z0.h, z1.h ; VBITS_GE_256-NEXT: add z1.h, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -163,6 +181,8 @@ define void @add_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; 
VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: add z0.h, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -179,6 +199,8 @@ define void @add_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -195,6 +217,8 @@ define void @add_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -231,6 +255,8 @@ define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -250,6 +276,10 @@ define void @add_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: add z0.s, z0.s, z1.s ; VBITS_GE_256-NEXT: add z1.s, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -261,6 +291,8 @@ define void @add_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: add z0.s, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -277,6 +309,8 @@ define void @add_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -293,6 +327,8 @@ define void @add_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -329,6 +365,8 @@ define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -348,6 +386,10 @@ define void @add_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { 
z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: add z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: add z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -359,6 +401,8 @@ define void @add_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: add z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -375,6 +419,8 @@ define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -394,6 +440,10 @@ define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d +; CHECK-NEXT: sel z2.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z3.d, p0, z3.d, z0.d ; CHECK-NEXT: add z0.d, z0.d, z1.d ; CHECK-NEXT: add z1.d, z2.d, z3.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -842,6 +892,8 @@ define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -861,6 +913,10 @@ define void @sub_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z3.b, z0.b ; VBITS_GE_256-NEXT: sub z0.b, z0.b, z1.b ; VBITS_GE_256-NEXT: sub z1.b, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] @@ -872,6 +928,8 @@ define void @sub_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_512-NEXT: sub z0.b, z0.b, z1.b ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -888,6 +946,8 @@ define void @sub_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -904,6 +964,8 @@ define void @sub_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: sub z0.b, z0.b, z1.b ; 
CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -940,6 +1002,8 @@ define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -959,6 +1023,10 @@ define void @sub_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: sub z0.h, z0.h, z1.h ; VBITS_GE_256-NEXT: sub z1.h, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -970,6 +1038,8 @@ define void @sub_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: sub z0.h, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -986,6 +1056,8 @@ define void @sub_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -1002,6 +1074,8 @@ define void @sub_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -1038,6 +1112,8 @@ define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1057,6 +1133,10 @@ define void @sub_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s ; VBITS_GE_256-NEXT: sub z1.s, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -1068,6 +1148,8 @@ define void @sub_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -1084,6 +1166,8 @@ define void @sub_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; 
CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1100,6 +1184,8 @@ define void @sub_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1136,6 +1222,8 @@ define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -1155,6 +1243,10 @@ define void @sub_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: sub z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: sub z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -1166,6 +1258,8 @@ define void @sub_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -1182,6 +1276,8 @@ define void @sub_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -1198,6 +1294,8 @@ define void @sub_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index 0ddf434eff930..6e451d29a3f0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -161,15 +161,17 @@ define void @sdiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: st1b { z0.b }, 
p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -182,15 +184,17 @@ define void @sdiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: sdiv_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -203,9 +207,9 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: sdiv_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z1.h @@ -214,14 +218,14 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: st1b { z1.h }, p0, [x0] +; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -414,13 +418,14 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-LABEL: sdiv_v16i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -433,13 +438,14 @@ define void @sdiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: sdiv_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -452,13 +458,14 @@ define void @sdiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; 
CHECK-LABEL: sdiv_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -893,11 +900,17 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { define void @udiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: udiv_v32i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] -; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -909,11 +922,17 @@ define void @udiv_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @udiv_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: udiv_v64i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] ; CHECK-NEXT: ptrue p0.s, vl64 -; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] -; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p0.b, vl64 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -926,22 +945,24 @@ define void @udiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: udiv_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s -; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h -; CHECK-NEXT: st1b { z1.h }, p0, [x0] +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -1137,7 +1158,9 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: st1h { z0.s 
}, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.h, vl16 +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -1153,7 +1176,9 @@ define void @udiv_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] ; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -1169,7 +1194,9 @@ define void @udiv_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] ; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 4feb86305f8f6..624b621065101 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -75,6 +75,7 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b @@ -88,6 +89,7 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b ; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b @@ -105,6 +107,7 @@ define void @sext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: sunpklo z0.h, z0.b @@ -122,6 +125,7 @@ define void @sext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: sunpklo z0.h, z0.b @@ -185,6 +189,7 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: sunpklo z0.h, z0.b @@ -203,6 +208,7 @@ define void @sext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: sunpklo z0.h, z0.b @@ -287,6 +293,7 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { 
z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: sunpklo z0.h, z0.b @@ -324,6 +331,7 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h @@ -337,6 +345,7 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h @@ -354,6 +363,7 @@ define void @sext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -371,6 +381,7 @@ define void @sext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -434,6 +445,7 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -452,6 +464,7 @@ define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -488,6 +501,7 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s @@ -501,6 +515,7 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s @@ -518,6 +533,7 @@ define void @sext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: sunpklo z0.d, z0.s @@ -535,6 +551,7 @@ define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: sunpklo z0.d, z0.s @@ -571,6 +588,7 @@ define void @zext_v32i8_v32i16(ptr %in, ptr 
%out) #0 { ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b @@ -584,6 +602,7 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: add z0.b, z0.b, z0.b ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b @@ -601,6 +620,7 @@ define void @zext_v64i8_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b @@ -618,6 +638,7 @@ define void @zext_v128i8_v128i16(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b @@ -681,6 +702,7 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b @@ -699,6 +721,7 @@ define void @zext_v64i8_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b @@ -783,6 +806,7 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: uunpklo z0.h, z0.b @@ -820,6 +844,7 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h @@ -833,6 +858,7 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: add z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h @@ -850,6 +876,7 @@ define void @zext_v32i16_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -867,6 +894,7 @@ define void @zext_v64i16_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: add z0.h, 
z0.h, z0.h ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -930,6 +958,7 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -948,6 +977,7 @@ define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -984,6 +1014,7 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -997,6 +1028,7 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: add z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s @@ -1014,6 +1046,7 @@ define void @zext_v16i32_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: uunpklo z0.d, z0.s @@ -1031,6 +1064,7 @@ define void @zext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: uunpklo z0.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll index 572759211fc1c..567f952d92948 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll @@ -16,6 +16,7 @@ define void @add_v64i8(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: add z0.b, z0.b, #7 // =0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -30,6 +31,7 @@ define void @add_v32i16(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: add z0.h, z0.h, #15 // =0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -44,6 +46,7 @@ define void @add_v16i32(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: add z0.s, z0.s, #31 // =0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -58,6 +61,7 @@ define void @add_v8i64(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: add z0.d, z0.d, #63 // =0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -76,6 +80,7 @@ define void @and_v64i8(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue 
p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: and z0.b, z0.b, #0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -90,6 +95,7 @@ define void @and_v32i16(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: and z0.h, z0.h, #0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -104,6 +110,7 @@ define void @and_v16i32(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: and z0.s, z0.s, #0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -118,6 +125,7 @@ define void @and_v8i64(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: and z0.d, z0.d, #0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -384,6 +392,7 @@ define void @or_v64i8(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: orr z0.b, z0.b, #0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -398,6 +407,7 @@ define void @or_v32i16(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: orr z0.h, z0.h, #0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -412,6 +422,7 @@ define void @or_v16i32(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: orr z0.s, z0.s, #0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -426,6 +437,7 @@ define void @or_v8i64(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: orr z0.d, z0.d, #0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -624,6 +636,7 @@ define void @sub_v64i8(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: sub z0.b, z0.b, #7 // =0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -638,6 +651,7 @@ define void @sub_v32i16(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: sub z0.h, z0.h, #15 // =0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -652,6 +666,7 @@ define void @sub_v16i32(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: sub z0.s, z0.s, #31 // =0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -666,6 +681,7 @@ define void @sub_v8i64(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: sub z0.d, z0.d, #63 // =0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -804,6 +820,7 @@ define void @xor_v64i8(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: eor z0.b, z0.b, #0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -818,6 +835,7 @@ define void @xor_v32i16(ptr %a) #0 { ; CHECK: // 
%bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: eor z0.h, z0.h, #0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -832,6 +850,7 @@ define void @xor_v16i32(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: eor z0.s, z0.s, #0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -846,6 +865,7 @@ define void @xor_v8i64(ptr %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: eor z0.d, z0.d, #0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll index 1285a5783677e..ecf9b3244d677 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-log.ll @@ -35,6 +35,8 @@ define void @and_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -54,6 +56,10 @@ define void @and_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z3.b, z0.b ; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: and z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] @@ -65,6 +71,8 @@ define void @and_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -81,6 +89,8 @@ define void @and_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -97,6 +107,8 @@ define void @and_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -133,6 +145,8 @@ define void @and_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -152,6 +166,10 @@ define void @and_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { 
z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: and z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -163,6 +181,8 @@ define void @and_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -179,6 +199,8 @@ define void @and_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -195,6 +217,8 @@ define void @and_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -231,6 +255,8 @@ define void @and_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -250,6 +276,10 @@ define void @and_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: and z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -261,6 +291,8 @@ define void @and_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -277,6 +309,8 @@ define void @and_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -293,6 +327,8 @@ define void @and_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -329,6 
+365,8 @@ define void @and_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -348,6 +386,10 @@ define void @and_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: and z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -359,6 +401,8 @@ define void @and_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: and z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -375,6 +419,8 @@ define void @and_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -391,6 +437,8 @@ define void @and_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -431,6 +479,8 @@ define void @or_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -450,6 +500,10 @@ define void @or_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z3.b, z0.b ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] @@ -461,6 +515,8 @@ define void @or_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -477,6 +533,8 @@ define void @or_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, 
z1.b, z0.b ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -493,6 +551,8 @@ define void @or_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -529,6 +589,8 @@ define void @or_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -548,6 +610,10 @@ define void @or_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -559,6 +625,8 @@ define void @or_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -575,6 +643,8 @@ define void @or_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -591,6 +661,8 @@ define void @or_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -627,6 +699,8 @@ define void @or_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -646,6 +720,10 @@ define void @or_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -657,6 +735,8 @@ define void @or_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; 
VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -673,6 +753,8 @@ define void @or_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -689,6 +771,8 @@ define void @or_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -725,6 +809,8 @@ define void @or_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -744,6 +830,10 @@ define void @or_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: orr z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -755,6 +845,8 @@ define void @or_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: orr z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -771,6 +863,8 @@ define void @or_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -787,6 +881,8 @@ define void @or_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -827,6 +923,8 @@ define void @xor_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -846,6 +944,10 @@ define void @xor_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] 
+; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z3.b, z0.b ; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: eor z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] @@ -857,6 +959,8 @@ define void @xor_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -873,6 +977,8 @@ define void @xor_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -889,6 +995,8 @@ define void @xor_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -925,6 +1033,8 @@ define void @xor_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -944,6 +1054,10 @@ define void @xor_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: eor z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -955,6 +1069,8 @@ define void @xor_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -971,6 +1087,8 @@ define void @xor_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -987,6 +1105,8 @@ define void @xor_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -1023,6 +1143,8 @@ define void @xor_v8i32(ptr %a, ptr %b) vscale_range(2,0) 
#0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1042,6 +1164,10 @@ define void @xor_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: eor z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -1053,6 +1179,8 @@ define void @xor_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -1069,6 +1197,8 @@ define void @xor_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1085,6 +1215,8 @@ define void @xor_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1121,6 +1253,8 @@ define void @xor_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -1140,6 +1274,10 @@ define void @xor_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_256-NEXT: eor z1.d, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -1151,6 +1289,8 @@ define void @xor_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: eor z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -1167,6 +1307,8 @@ define void @xor_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: eor 
z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -1183,6 +1325,8 @@ define void @xor_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll index be19e9ef5e86f..ac9536b9b40d7 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll @@ -51,6 +51,8 @@ define i8 @uaddv_v64i8(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_256-NEXT: add z0.b, z1.b, z0.b ; VBITS_GE_256-NEXT: uaddv d0, p0, z0.b ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -136,6 +138,8 @@ define i16 @uaddv_v32i16(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: add z0.h, z1.h, z0.h ; VBITS_GE_256-NEXT: uaddv d0, p0, z0.h ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -221,6 +225,8 @@ define i32 @uaddv_v16i32(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_256-NEXT: add z0.s, z1.s, z0.s ; VBITS_GE_256-NEXT: uaddv d0, p0, z0.s ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -306,6 +312,8 @@ define i64 @uaddv_v8i64(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_256-NEXT: add z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: uaddv d0, p0, z0.d ; VBITS_GE_256-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 2d78945399176..829a60a8d23e5 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -170,6 +170,7 @@ define void @srem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -177,7 +178,8 @@ define void @srem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -194,6 +196,7 @@ define void @srem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { 
z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -201,7 +204,8 @@ define void @srem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a @@ -218,11 +222,12 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h @@ -233,7 +238,8 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a @@ -250,11 +256,12 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h @@ -285,7 +292,8 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z2.b, p1, z2.b, z3.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a @@ -418,6 +426,7 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: mov z4.d, z0.d @@ -432,7 +441,8 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h -; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z2.h +; VBITS_GE_256-NEXT: sub z0.h, z0.h, z1.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; @@ -442,11 +452,13 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z3.s, 
z0.h ; VBITS_GE_512-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h -; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_512-NEXT: mul z1.h, p0/m, z1.h, z2.h +; VBITS_GE_512-NEXT: sub z0.h, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -463,11 +475,13 @@ define void @srem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a @@ -484,11 +498,13 @@ define void @srem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a @@ -505,6 +521,7 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: mov z4.d, z0.d @@ -519,7 +536,8 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a @@ -567,9 +585,11 @@ define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a @@ -608,16 +628,20 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: movprfx z2, z0 ; VBITS_GE_256-NEXT: sdiv z2.s, p0/m, z2.s, z1.s ; VBITS_GE_256-NEXT: movprfx z5, z3 ; VBITS_GE_256-NEXT: sdiv z5.s, p0/m, z5.s, z4.s -; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z3 -; VBITS_GE_256-NEXT: mls z1.s, 
p0/m, z5.s, z4.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z2.s +; VBITS_GE_256-NEXT: movprfx z2, z5 +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z4.s +; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s +; VBITS_GE_256-NEXT: sub z1.s, z3.s, z2.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -627,9 +651,11 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s ; VBITS_GE_512-NEXT: movprfx z2, z0 ; VBITS_GE_512-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s +; VBITS_GE_512-NEXT: mul z1.s, p0/m, z1.s, z2.s +; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a @@ -645,9 +671,11 @@ define void @srem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a @@ -663,9 +691,11 @@ define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a @@ -715,9 +745,11 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a @@ -757,16 +789,20 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: movprfx z2, z0 ; VBITS_GE_256-NEXT: sdiv z2.d, p0/m, z2.d, z1.d ; VBITS_GE_256-NEXT: movprfx z5, z3 ; VBITS_GE_256-NEXT: sdiv z5.d, p0/m, z5.d, z4.d -; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z3 -; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z2.d +; VBITS_GE_256-NEXT: movprfx z2, z5 +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z4.d +; VBITS_GE_256-NEXT: sub z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: sub z1.d, z3.d, z2.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; 
VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -776,9 +812,11 @@ define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d ; VBITS_GE_512-NEXT: movprfx z2, z0 ; VBITS_GE_512-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d +; VBITS_GE_512-NEXT: mul z1.d, p0/m, z1.d, z2.d +; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i64>, ptr %a @@ -794,9 +832,11 @@ define void @srem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a @@ -812,9 +852,11 @@ define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i64>, ptr %a @@ -988,6 +1030,7 @@ define void @urem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -995,7 +1038,8 @@ define void @urem_v32i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -1012,6 +1056,7 @@ define void @urem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -1019,7 +1064,8 @@ define void @urem_v64i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a @@ -1036,11 +1082,12 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: 
uunpklo z5.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h @@ -1051,7 +1098,8 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h ; CHECK-NEXT: uzp1 z2.b, z3.b, z3.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a @@ -1068,11 +1116,12 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 +; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h @@ -1103,7 +1152,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z2.b, p1, z2.b, z3.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <256 x i8>, ptr %a @@ -1236,6 +1286,7 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: mov z4.d, z0.d @@ -1250,7 +1301,8 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: splice z2.h, p1, z2.h, z3.h -; VBITS_GE_256-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z2.h +; VBITS_GE_256-NEXT: sub z0.h, z0.h, z1.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; @@ -1260,11 +1312,13 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h ; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_512-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h -; VBITS_GE_512-NEXT: mls z0.h, p0/m, z2.h, z1.h +; VBITS_GE_512-NEXT: mul z1.h, p0/m, z1.h, z2.h +; VBITS_GE_512-NEXT: sub z0.h, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -1281,11 +1335,13 @@ define void @urem_v32i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a @@ -1302,11 +1358,13 @@ define void 
@urem_v64i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a @@ -1323,6 +1381,7 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: mov z4.d, z0.d @@ -1337,7 +1396,8 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p1, z2.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i16>, ptr %a @@ -1385,9 +1445,11 @@ define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, ptr %a @@ -1426,16 +1488,20 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: movprfx z2, z0 ; VBITS_GE_256-NEXT: udiv z2.s, p0/m, z2.s, z1.s ; VBITS_GE_256-NEXT: movprfx z5, z3 ; VBITS_GE_256-NEXT: udiv z5.s, p0/m, z5.s, z4.s -; VBITS_GE_256-NEXT: mls z0.s, p0/m, z2.s, z1.s -; VBITS_GE_256-NEXT: movprfx z1, z3 -; VBITS_GE_256-NEXT: mls z1.s, p0/m, z5.s, z4.s +; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z2.s +; VBITS_GE_256-NEXT: movprfx z2, z5 +; VBITS_GE_256-NEXT: mul z2.s, p0/m, z2.s, z4.s +; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s +; VBITS_GE_256-NEXT: sub z1.s, z3.s, z2.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -1445,9 +1511,11 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s ; VBITS_GE_512-NEXT: movprfx z2, z0 ; VBITS_GE_512-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; VBITS_GE_512-NEXT: mls z0.s, p0/m, z2.s, z1.s +; VBITS_GE_512-NEXT: mul z1.s, p0/m, z1.s, z2.s +; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, ptr %a @@ -1463,9 +1531,11 
@@ define void @urem_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a @@ -1481,9 +1551,11 @@ define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i32>, ptr %a @@ -1533,9 +1605,11 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, ptr %a @@ -1575,16 +1649,20 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: movprfx z2, z0 ; VBITS_GE_256-NEXT: udiv z2.d, p0/m, z2.d, z1.d ; VBITS_GE_256-NEXT: movprfx z5, z3 ; VBITS_GE_256-NEXT: udiv z5.d, p0/m, z5.d, z4.d -; VBITS_GE_256-NEXT: mls z0.d, p0/m, z2.d, z1.d -; VBITS_GE_256-NEXT: movprfx z1, z3 -; VBITS_GE_256-NEXT: mls z1.d, p0/m, z5.d, z4.d +; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z2.d +; VBITS_GE_256-NEXT: movprfx z2, z5 +; VBITS_GE_256-NEXT: mul z2.d, p0/m, z2.d, z4.d +; VBITS_GE_256-NEXT: sub z0.d, z0.d, z1.d +; VBITS_GE_256-NEXT: sub z1.d, z3.d, z2.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -1594,9 +1672,11 @@ define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d ; VBITS_GE_512-NEXT: movprfx z2, z0 ; VBITS_GE_512-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; VBITS_GE_512-NEXT: mls z0.d, p0/m, z2.d, z1.d +; VBITS_GE_512-NEXT: mul z1.d, p0/m, z1.d, z2.d +; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i64>, ptr %a @@ -1612,9 +1692,11 @@ define void @urem_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; 
CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a @@ -1630,9 +1712,11 @@ define void @urem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i64>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll index 37396ba7011be..839e3794288af 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -34,14 +34,16 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ptrue p1.b, vl32 -; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] -; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x0] +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 +; CHECK-NEXT: mov z0.b, p0/m, z1.b +; CHECK-NEXT: sel z1.b, p0, z2.b, z0.b +; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -53,31 +55,37 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov z0.b, w2 -; VBITS_GE_256-NEXT: ptrue p0.b +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 -; VBITS_GE_256-NEXT: ptrue p1.b, vl32 -; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z1.b }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p1/z, [x1, x8] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.b, p0, z0.b, z2.b -; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z3.b -; VBITS_GE_256-NEXT: st1b { z0.b }, p1, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x0] +; VBITS_GE_256-NEXT: mov z0.b, w2 +; VBITS_GE_256-NEXT: ptrue p1.b +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z4.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0 +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z1.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z3.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z4.b, z0.b +; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z1.b +; VBITS_GE_256-NEXT: sel z1.b, p1, z2.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v64i8: 
; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: mov z0.b, w2 -; VBITS_GE_512-NEXT: ptrue p0.b -; VBITS_GE_512-NEXT: ptrue p1.b, vl64 -; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.b }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.b, p0, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x0] +; VBITS_GE_512-NEXT: ptrue p1.b +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z0.b, #0 +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z1.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z2.b, z0.b +; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <64 x i8>, ptr %a %op2 = load volatile <64 x i8>, ptr %b @@ -89,14 +97,16 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v128i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ptrue p1.b, vl128 -; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] -; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x0] +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 +; CHECK-NEXT: mov z0.b, p0/m, z1.b +; CHECK-NEXT: sel z1.b, p0, z2.b, z0.b +; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i8>, ptr %a %op2 = load volatile <128 x i8>, ptr %b @@ -108,14 +118,16 @@ define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v256i8(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v256i8: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: ptrue p1.b, vl256 -; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] -; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x0] +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 +; CHECK-NEXT: mov z0.b, p0/m, z1.b +; CHECK-NEXT: sel z1.b, p0, z2.b, z0.b +; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <256 x i8>, ptr %a %op2 = load volatile <256 x i8>, ptr %b @@ -154,14 +166,16 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: sel z1.h, p0, z2.h, z0.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, 
[x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -174,32 +188,38 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: mov z0.h, w2 -; VBITS_GE_256-NEXT: ptrue p0.h +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h -; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h -; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] +; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z3.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z4.h, z0.h +; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p0.h -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z1.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z2.h, z0.h +; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x i16>, ptr %a %op2 = load volatile <32 x i16>, ptr %b @@ -212,14 +232,16 @@ define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: sel z1.h, p0, z2.h, z0.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i16>, ptr %a %op2 = load volatile <64 x i16>, ptr %b @@ -232,14 +254,16 @@ define void 
@select_v128i16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.h, vl128 +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] -; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x0] +; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 +; CHECK-NEXT: mov z0.h, p0/m, z1.h +; CHECK-NEXT: sel z1.h, p0, z2.h, z0.h +; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i16>, ptr %a %op2 = load volatile <128 x i16>, ptr %b @@ -277,15 +301,17 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ptrue p1.s, vl8 -; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -297,33 +323,39 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p0.s +; VBITS_GE_256-NEXT: mov x9, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, w8 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 -; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s -; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s -; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x9, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z3.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z4.s, z0.s +; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16i32: ; 
VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 -; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z1.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z2.s, z0.s +; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x i32>, ptr %a %op2 = load volatile <16 x i32>, ptr %b @@ -335,15 +367,17 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32i32: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ptrue p1.s, vl32 -; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i32>, ptr %a %op2 = load volatile <32 x i32>, ptr %b @@ -355,15 +389,17 @@ define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64i32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64i32: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ptrue p1.s, vl64 -; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] -; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 +; CHECK-NEXT: mov z0.s, p0/m, z1.s +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i32>, ptr %a %op2 = load volatile <64 x i32>, ptr %b @@ -401,16 +437,18 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ptrue p1.d, vl4 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: 
ld1d { z2.d }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b @@ -422,35 +460,41 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: mov x9, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, x8 -; VBITS_GE_256-NEXT: ptrue p1.d, vl4 -; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] -; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d -; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d -; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x9, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z3.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z4.d, z0.d +; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x9, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z1.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z2.d, z0.d +; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x i64>, ptr %a %op2 = load volatile <8 x i64>, ptr %b @@ -462,16 +506,18 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ptrue p1.d, vl16 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 
-; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i64>, ptr %a %op2 = load volatile <16 x i64>, ptr %b @@ -483,16 +529,18 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ptrue p1.d, vl32 -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] -; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] +; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: sel z1.d, p0, z2.d, z0.d +; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i64>, ptr %a %op2 = load volatile <32 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 1d9e01f4ecfdf..ee0b50de05b18 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -687,9 +687,10 @@ define void @ucvtf_v16i64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a %res = uitofp <16 x i64> %op1 to <16 x half> @@ -703,9 +704,10 @@ define void @ucvtf_v32i64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x i64>, ptr %a %res = uitofp <32 x i64> %op1 to <32 x half> @@ -1618,9 +1620,10 @@ define void @scvtf_v16i64_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d -; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a %res = sitofp <16 x i64> %op1 to <16 x half> @@ -1634,9 +1637,10 @@ define void @scvtf_v32i64_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: scvtf z0.h, p0/m, z0.d -; CHECK-NEXT: 
ptrue p0.s, vl32 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %op1 = load <32 x i64>, ptr %a %res = sitofp <32 x i64> %op1 to <32 x half> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll index 9cebbc4aab9b7..d4625a8b01bbc 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-vselect.ll @@ -35,7 +35,13 @@ define void @select_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -56,10 +62,21 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8] ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: sel z2.b, p0, z2.b, z0.b +; VBITS_GE_256-NEXT: sel z3.b, p0, z3.b, z0.b ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z2.b, z3.b -; VBITS_GE_256-NEXT: sel z0.b, p1, z0.b, z1.b -; VBITS_GE_256-NEXT: sel z1.b, p2, z2.b, z3.b +; VBITS_GE_256-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.b, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.b +; VBITS_GE_256-NEXT: and z4.b, z4.b, #0x1 +; VBITS_GE_256-NEXT: and z5.b, z5.b, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.b, p1/z, z4.b, #0 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z5.b, #0 +; VBITS_GE_256-NEXT: sel z0.b, p2, z0.b, z1.b +; VBITS_GE_256-NEXT: sel z1.b, p1, z2.b, z3.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8] ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -69,7 +86,13 @@ define void @select_v64i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_512-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; VBITS_GE_512-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.b +; VBITS_GE_512-NEXT: and z2.b, z2.b, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; VBITS_GE_512-NEXT: sel z0.b, p1, z0.b, z1.b ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -87,7 +110,13 @@ define void @select_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -105,7 +134,13 @@ 
define void @select_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; CHECK-NEXT: mov z2.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: and z2.b, z2.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p1/z, z2.b, #0 ; CHECK-NEXT: sel z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret @@ -148,7 +183,13 @@ define void @select_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -169,10 +210,21 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h +; VBITS_GE_256-NEXT: sel z2.h, p0, z2.h, z0.h +; VBITS_GE_256-NEXT: sel z3.h, p0, z3.h, z0.h ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z2.h, z3.h -; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h -; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h +; VBITS_GE_256-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.h, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: and z4.h, z4.h, #0x1 +; VBITS_GE_256-NEXT: and z5.h, z5.h, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.h, p1/z, z4.h, #0 +; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z5.h, #0 +; VBITS_GE_256-NEXT: sel z0.h, p2, z0.h, z1.h +; VBITS_GE_256-NEXT: sel z1.h, p1, z2.h, z3.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -182,7 +234,13 @@ define void @select_v32i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_512-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: and z2.h, z2.h, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -200,7 +258,13 @@ define void @select_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -218,7 +282,13 @@ define void 
@select_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: and z2.h, z2.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p1/z, z2.h, #0 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -261,7 +331,13 @@ define void @select_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -282,10 +358,21 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: sel z2.s, p0, z2.s, z0.s +; VBITS_GE_256-NEXT: sel z3.s, p0, z3.s, z0.s ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z2.s, z3.s -; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s -; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s +; VBITS_GE_256-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: and z4.s, z4.s, #0x1 +; VBITS_GE_256-NEXT: and z5.s, z5.s, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.s, p1/z, z4.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z5.s, #0 +; VBITS_GE_256-NEXT: sel z0.s, p2, z0.s, z1.s +; VBITS_GE_256-NEXT: sel z1.s, p1, z2.s, z3.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -295,7 +382,13 @@ define void @select_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_512-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: and z2.s, z2.s, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -313,7 +406,13 @@ define void @select_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -331,7 +430,13 @@ define void 
@select_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and z2.s, z2.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p1/z, z2.s, #0 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -375,7 +480,13 @@ define void @select_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -396,10 +507,21 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d +; VBITS_GE_256-NEXT: sel z2.d, p0, z2.d, z0.d +; VBITS_GE_256-NEXT: sel z3.d, p0, z3.d, z0.d ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d -; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d -; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d +; VBITS_GE_256-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: and z4.d, z4.d, #0x1 +; VBITS_GE_256-NEXT: and z5.d, z5.d, #0x1 +; VBITS_GE_256-NEXT: cmpne p2.d, p1/z, z4.d, #0 +; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z5.d, #0 +; VBITS_GE_256-NEXT: sel z0.d, p2, z0.d, z1.d +; VBITS_GE_256-NEXT: sel z1.d, p1, z2.d, z3.d ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -409,7 +531,13 @@ define void @select_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_512-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: and z2.d, z2.d, #0x1 +; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -427,7 +555,13 @@ define void @select_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -445,7 +579,13 @@ define void @select_v32i64(ptr 
%a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: and z2.d, z2.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll index 78b41f71f0ea2..055c83b8cdceb 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll @@ -9,13 +9,15 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: add z1.s, z0.s, z0.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: add z2.s, z0.s, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #16 -; CHECK-NEXT: add z2.s, z2.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: dup v0.4s, v0.s[2] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: add z1.s, z1.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 @@ -32,13 +34,15 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: add z1.s, z0.s, z0.s +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: add z2.s, z0.s, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #24 -; CHECK-NEXT: add z2.s, z2.s, z2.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: dup v0.2s, v0.s[0] -; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: add z1.s, z1.s, z1.s +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll index d0585274a43e3..47e68b06814e9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll @@ -55,6 +55,8 @@ define i8 @andv_v64i8(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: andv b0, p0, z0.b ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -144,6 +146,8 @@ define i16 @andv_v32i16(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: andv h0, p0, z0.h ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -233,6 
+237,8 @@ define i32 @andv_v16i32(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: andv s0, p0, z0.s ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -320,6 +326,8 @@ define i64 @andv_v8i64(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_256-NEXT: and z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: andv d0, p0, z0.d ; VBITS_GE_256-NEXT: fmov x0, d0 @@ -413,6 +421,8 @@ define i8 @eorv_v64i8(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: eorv b0, p0, z0.b ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -502,6 +512,8 @@ define i16 @eorv_v32i16(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: eorv h0, p0, z0.h ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -591,6 +603,8 @@ define i32 @eorv_v16i32(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: eorv s0, p0, z0.s ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -678,6 +692,8 @@ define i64 @eorv_v8i64(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_256-NEXT: eor z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: eorv d0, p0, z0.d ; VBITS_GE_256-NEXT: fmov x0, d0 @@ -771,6 +787,8 @@ define i8 @orv_v64i8(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z0.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z0.b ; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: orv b0, p0, z0.b ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -860,6 +878,8 @@ define i16 @orv_v32i16(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z0.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: orv h0, p0, z0.h ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -949,6 +969,8 @@ define i32 @orv_v16i32(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z0.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z0.s ; VBITS_GE_256-NEXT: orr z0.d, 
z1.d, z0.d ; VBITS_GE_256-NEXT: orv s0, p0, z0.s ; VBITS_GE_256-NEXT: fmov w0, s0 @@ -1036,6 +1058,8 @@ define i64 @orv_v8i64(ptr %a) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: mov z0.d, p0/m, z0.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z0.d ; VBITS_GE_256-NEXT: orr z0.d, z1.d, z0.d ; VBITS_GE_256-NEXT: orv d0, p0, z0.d ; VBITS_GE_256-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll index d2206a74533d8..0f20a56f06023 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -31,7 +31,10 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1b { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <4 x ptr>, ptr %b %vals = call <4 x i8> @llvm.masked.gather.v4i8(<4 x ptr> %ptrs, i32 8, <4 x i1> splat (i1 true), <4 x i8> poison) @@ -95,7 +98,11 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1b { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <32 x ptr>, ptr %b %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x ptr> %ptrs, i32 8, <32 x i1> splat (i1 true), <32 x i8> poison) @@ -177,7 +184,10 @@ define void @masked_gather_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1h { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <16 x ptr>, ptr %b %vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x ptr> %ptrs, i32 8, <16 x i1> splat (i1 true), <16 x i16> poison) @@ -191,7 +201,10 @@ define void @masked_gather_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1h { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.h, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <32 x ptr>, ptr %b %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x ptr> %ptrs, i32 8, <32 x i1> splat (i1 true), <32 x i16> poison) @@ -255,7 +268,9 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %ptrs = load <8 x ptr>, ptr %b %vals = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %ptrs, i32 
8, <8 x i1> splat (i1 true), <8 x i32> poison) @@ -269,7 +284,9 @@ define void @masked_gather_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl16 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <16 x ptr>, ptr %b %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x ptr> %ptrs, i32 8, <16 x i1> splat (i1 true), <16 x i32> poison) @@ -283,7 +300,9 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %ptrs = load <32 x ptr>, ptr %b %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x ptr> %ptrs, i32 8, <32 x i1> splat (i1 true), <32 x i32> poison) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 093e6cd9328c8..674921be3d6dc 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -45,8 +45,11 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] -; CHECK-NEXT: st1b { z0.d }, p0, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i8>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -140,15 +143,19 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_gather_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1b { z0.d }, p1, [x0] +; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x i8>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -262,14 +269,17 @@ define void @masked_gather_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: masked_gather_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ptrue p2.d, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1d { 
z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x i16>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -283,14 +293,17 @@ define void @masked_gather_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_gather_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x i16>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -374,13 +387,15 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-LABEL: masked_gather_v8i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: ptrue p2.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: punpklo p0.h, p0.b -; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; VBITS_GE_512-NEXT: st1w { z0.d }, p1, [x0] +; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p2/z, [x1] +; VBITS_GE_512-NEXT: punpklo p1.h, p1.b +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p2.b +; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x i32>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -394,13 +409,15 @@ define void @masked_gather_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: masked_gather_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ptrue p2.d, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x i32>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -414,13 +431,15 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_gather_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, 
p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x i32>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -482,6 +501,7 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -502,9 +522,11 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] @@ -516,6 +538,7 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -534,6 +557,7 @@ define void @masked_gather_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -552,6 +576,7 @@ define void @masked_gather_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -670,14 +695,17 @@ define void @masked_gather_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: masked_gather_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ptrue p2.d, vl16 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x half>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -691,14 +719,17 @@ define void @masked_gather_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: 
masked_gather_v32f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1h { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x half>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -782,13 +813,15 @@ define void @masked_gather_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-LABEL: masked_gather_v8f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: ptrue p2.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x1] -; VBITS_GE_512-NEXT: punpklo p0.h, p0.b -; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; VBITS_GE_512-NEXT: st1w { z0.d }, p1, [x0] +; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p2/z, [x1] +; VBITS_GE_512-NEXT: punpklo p1.h, p1.b +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p2.b +; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x float>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -802,13 +835,15 @@ define void @masked_gather_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: masked_gather_v16f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: ptrue p2.d, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x float>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -822,13 +857,15 @@ define void @masked_gather_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_gather_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x float>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -891,6 +928,7 @@ define void 
@masked_gather_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -911,9 +949,11 @@ define void @masked_gather_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z1.d, #0.0 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] @@ -925,6 +965,7 @@ define void @masked_gather_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -943,6 +984,7 @@ define void @masked_gather_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -961,6 +1003,7 @@ define void @masked_gather_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -979,13 +1022,15 @@ define void @masked_gather_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscale ; CHECK-LABEL: masked_gather_32b_scaled_sext_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p2.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, sxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z0.s, sxtw #1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1004,6 +1049,7 @@ define void @masked_gather_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1] +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x2, z0.s, sxtw #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -1024,6 +1070,7 @@ define void @masked_gather_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x1] 
+; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [x2, z0.d, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -1041,13 +1088,15 @@ define void @masked_gather_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_ran ; CHECK-LABEL: masked_gather_32b_scaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p2.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, uxtw #1] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z0.s, uxtw #1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1063,13 +1112,15 @@ define void @masked_gather_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_r ; CHECK-LABEL: masked_gather_32b_unscaled_sext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p2.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, sxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z0.s, sxtw] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1086,13 +1137,15 @@ define void @masked_gather_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_r ; CHECK-LABEL: masked_gather_32b_unscaled_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: ptrue p2.s, vl32 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, uxtw] -; CHECK-NEXT: st1h { z0.s }, p1, [x0] +; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1h { z0.s }, p1/z, [x2, z0.s, uxtw] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x half>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -1109,13 +1162,15 @@ define void @masked_gather_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16 ; CHECK-LABEL: masked_gather_64b_scaled: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z0.d, lsl #2] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z0.d, 
lsl #2] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %idxs = load <32 x i64>, ptr %b @@ -1130,13 +1185,15 @@ define void @masked_gather_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range( ; CHECK-LABEL: masked_gather_64b_unscaled: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %idxs = load <32 x i64>, ptr %b @@ -1152,13 +1209,15 @@ define void @masked_gather_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(1 ; CHECK-LABEL: masked_gather_vec_plus_reg: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [x2, z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %bases = load <32 x ptr>, ptr %b @@ -1174,13 +1233,15 @@ define void @masked_gather_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_gather_vec_plus_imm: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d, #4] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d, #4] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %bases = load <32 x ptr>, ptr %b @@ -1198,11 +1259,17 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) # ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] -; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: punpklo p3.h, p1.b +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: and p2.b, p3/z, p3.b, p2.b +; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] +; CHECK-NEXT: cmpne p1.s, p1/z, z1.s, #0 +; CHECK-NEXT: sel z1.s, p0, z2.s, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, 
z0.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] @@ -1220,13 +1287,15 @@ define void @masked_gather_passthru_0(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: masked_gather_passthru_0: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: ptrue p2.d, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] -; CHECK-NEXT: st1w { z0.d }, p1, [x0] +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: and p1.b, p1/z, p1.b, p2.b +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %cvals = load <32 x float>, ptr %a %ptrs = load <32 x ptr>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 34dc0bb5ef2d2..2cce7f1fb8b19 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -75,6 +75,7 @@ define void @masked_load_v8f32(ptr %ap, ptr %bp, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -97,6 +98,8 @@ define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] @@ -109,6 +112,7 @@ define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -127,6 +131,7 @@ define void @masked_load_v32f32(ptr %ap, ptr %bp, ptr %c) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -145,6 +150,7 @@ define void @masked_load_v64f32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -167,6 +173,8 @@ define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; VBITS_GE_256-NEXT: cmpeq p2.b, p0/z, z2.b, z3.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; 
VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p2/z, [x0] ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x2, x8] @@ -179,6 +187,7 @@ define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -201,6 +210,8 @@ define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; VBITS_GE_256-NEXT: cmpeq p2.h, p0/z, z2.h, z3.h +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p2/z, [x0] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] @@ -213,6 +224,7 @@ define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -235,6 +247,8 @@ define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; VBITS_GE_256-NEXT: cmpeq p2.s, p0/z, z2.s, z3.s +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p2/z, [x0] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] @@ -247,6 +261,7 @@ define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -269,6 +284,8 @@ define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x0] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] @@ -281,6 +298,7 @@ define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -303,6 +321,8 @@ define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: cmpeq p2.d, p0/z, z2.d, z3.d +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: ld1d { 
z0.d }, p1/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x0] ; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d @@ -317,6 +337,7 @@ define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] @@ -340,6 +361,8 @@ define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p2.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x0] ; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d @@ -354,6 +377,7 @@ define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] @@ -372,7 +396,8 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b @@ -387,6 +412,7 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -421,6 +447,7 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -456,6 +483,7 @@ define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -473,7 +501,8 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue 
p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h @@ -488,6 +517,7 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -522,6 +552,7 @@ define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -539,7 +570,8 @@ define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s @@ -554,6 +586,7 @@ define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -571,7 +604,8 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: cmpeq p1.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b @@ -586,6 +620,7 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -620,6 +655,7 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1b { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -655,6 +691,7 @@ define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -672,7 +709,8 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { 
z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h @@ -687,6 +725,7 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -721,6 +760,7 @@ define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -738,7 +778,8 @@ define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -753,6 +794,7 @@ define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -794,6 +836,7 @@ define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sb { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -838,6 +881,7 @@ define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sb { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -884,6 +928,7 @@ define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sb { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -927,6 +972,7 @@ define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sh { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -971,6 +1017,7 @@ define void 
@masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sh { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1012,6 +1059,7 @@ define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1053,6 +1101,7 @@ define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.h }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1097,6 +1146,7 @@ define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1143,6 +1193,7 @@ define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1186,6 +1237,7 @@ define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1h { z0.s }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1230,6 +1282,7 @@ define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1271,6 +1324,7 @@ define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1288,6 +1342,7 @@ define void @masked_load_sext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0] ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -1305,6 +1360,7 @@ define void @masked_load_sext_v64i8i32(ptr %ap, ptr 
%bp, ptr %c) vscale_range(16 ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1sb { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -1322,6 +1378,7 @@ define void @masked_load_sext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1sb { z0.d }, p1/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret @@ -1339,6 +1396,7 @@ define void @masked_load_sext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -1356,6 +1414,7 @@ define void @masked_load_sext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1sh { z0.d }, p1/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret @@ -1373,6 +1432,7 @@ define void @masked_load_sext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret @@ -1390,6 +1450,7 @@ define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0] ; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret @@ -1407,6 +1468,7 @@ define void @masked_load_zext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16 ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1b { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1b { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -1424,6 +1486,7 @@ define void @masked_load_zext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1b { z0.d }, p1/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret @@ -1441,6 +1504,7 @@ define void @masked_load_zext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0] ; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret @@ -1458,6 +1522,7 @@ define void @masked_load_zext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(1 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1h { z0.d }, p1/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret @@ -1475,6 +1540,7 @@ define void @masked_load_zext_v32i32i64(ptr %ap, ptr %bp, ptr %c) 
vscale_range(1 ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1w { z0.d }, p1/z, [x0] ; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret @@ -1492,7 +1558,8 @@ define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpne p1.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s @@ -1507,6 +1574,7 @@ define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret @@ -1524,7 +1592,8 @@ define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpgt p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: cmpgt p1.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s @@ -1539,6 +1608,7 @@ define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpgt p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [x0] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b322432..1d1e92b2c649f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -135,14 +135,15 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: uunpklo z1.h, z0.b +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: uunpklo z0.s, z1.h +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i8>, ptr %a @@ -253,9 +254,10 @@ define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals 
= load <16 x i16>, ptr %a @@ -274,9 +276,10 @@ define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i16>, ptr %a @@ -359,6 +362,7 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i32>, ptr %a @@ -378,6 +382,7 @@ define void @masked_scatter_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i32>, ptr %a @@ -397,6 +402,7 @@ define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i32>, ptr %a @@ -453,7 +459,8 @@ define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i64>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -468,12 +475,14 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [z1.d] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z2.d, #0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; @@ -483,7 +492,8 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i64>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -499,7 +509,8 @@ define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load 
<16 x i64>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -515,7 +526,8 @@ define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i64>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -629,9 +641,10 @@ define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x half>, ptr %a @@ -650,9 +663,10 @@ define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -736,6 +750,7 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: punpklo p0.h, p0.b +; VBITS_GE_512-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x float>, ptr %a @@ -755,6 +770,7 @@ define void @masked_scatter_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x float>, ptr %a @@ -774,6 +790,7 @@ define void @masked_scatter_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a @@ -831,7 +848,8 @@ define void @masked_scatter_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x double>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -846,12 +864,14 @@ define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; VBITS_GE_256-NEXT: fcmeq p0.d, p0/z, z2.d, #0.0 +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [z1.d] +; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, 
z2.d, #0.0 +; VBITS_GE_256-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: ret ; @@ -861,7 +881,8 @@ define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x double>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -877,7 +898,8 @@ define void @masked_scatter_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x double>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -893,7 +915,8 @@ define void @masked_scatter_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x double>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -915,6 +938,7 @@ define void @masked_scatter_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscal ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw #1] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -933,7 +957,8 @@ define void @masked_scatter_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscal ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: st1w { z0.s }, p1, [x2, z1.s, sxtw #2] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1w { z0.s }, p0, [x2, z1.s, sxtw #2] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -951,7 +976,8 @@ define void @masked_scatter_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscal ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 -; CHECK-NEXT: st1d { z0.d }, p1, [x2, z1.d, lsl #3] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [x2, z1.d, lsl #3] ; CHECK-NEXT: ret %vals = load <32 x double>, ptr %a %idxs = load <32 x i32>, ptr %b @@ -972,6 +998,7 @@ define void @masked_scatter_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_ra ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw #1] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -993,6 +1020,7 @@ define void @masked_scatter_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_ ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, sxtw] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -1015,6 +1043,7 @@ define void @masked_scatter_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_ ; CHECK-NEXT: fcmeq p0.h, 
p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1h { z0.s }, p0, [x2, z1.s, uxtw] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -1037,6 +1066,7 @@ define void @masked_scatter_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(1 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a @@ -1057,6 +1087,7 @@ define void @masked_scatter_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a @@ -1078,6 +1109,7 @@ define void @masked_scatter_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range( ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [x2, z1.d] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a @@ -1099,6 +1131,7 @@ define void @masked_scatter_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d, #4] ; CHECK-NEXT: ret %vals = load <32 x float>, ptr %a @@ -1127,7 +1160,8 @@ define void @masked_scatter_bitcast_infinite_loop(ptr %a, ptr %b, i1 %cond) vsca ; CHECK-NEXT: // %bb.1: // %bb.1 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] ; CHECK-NEXT: .LBB47_2: // %bb.2 ; CHECK-NEXT: ret %vals = load volatile <8 x double>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll index b0d4f79aea110..32d4aa44e6f62 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -71,7 +71,8 @@ define void @masked_store_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %a = load <8 x float>, ptr %ap @@ -91,7 +92,9 @@ define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s -; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s +; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p0.b +; VBITS_GE_256-NEXT: and p0.b, p2/z, p2.b, p0.b ; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z2.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -101,7 +104,8 @@ define void @masked_store_v16f32(ptr %ap, ptr %bp) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; 
VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret %a = load <16 x float>, ptr %ap @@ -117,7 +121,8 @@ define void @masked_store_v32f32(ptr %ap, ptr %bp) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %a = load <32 x float>, ptr %ap @@ -133,7 +138,8 @@ define void @masked_store_v64f32(ptr %ap, ptr %bp) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %a = load <64 x float>, ptr %ap @@ -173,7 +179,8 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, ptr %ap @@ -217,7 +224,8 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, ptr %ap @@ -258,7 +266,8 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <8 x i64>, ptr %ap @@ -302,7 +311,8 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1b { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, ptr %ap @@ -346,7 +356,8 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <16 x i32>, ptr %ap @@ -387,7 +398,8 @@ define void @masked_store_trunc_v32i16i8(ptr %ap, ptr 
%bp, ptr %dest) #0 { ; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: and p0.b, p1/z, p1.b, p0.b ; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %a = load <32 x i16>, ptr %ap diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll index 33d5ac4cd299e..0b93039f0bfb2 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-partial-reduce.ll @@ -93,13 +93,15 @@ define <16 x i16> @two_way_i8_i16_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: ldr z0, [x1] ; SVE-NEXT: ldr z1, [x2] ; SVE-NEXT: ptrue p0.h -; SVE-NEXT: ldr z4, [x0] ; SVE-NEXT: uunpklo z2.h, z0.b ; SVE-NEXT: uunpklo z3.h, z1.b ; SVE-NEXT: uunpkhi z0.h, z0.b ; SVE-NEXT: uunpkhi z1.h, z1.b -; SVE-NEXT: mad z2.h, p0/m, z3.h, z4.h -; SVE-NEXT: mad z0.h, p0/m, z1.h, z2.h +; SVE-NEXT: mul z2.h, p0/m, z2.h, z3.h +; SVE-NEXT: ldr z3, [x0] +; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h +; SVE-NEXT: add z1.h, z3.h, z2.h +; SVE-NEXT: add z0.h, z0.h, z1.h ; SVE-NEXT: mov z1.d, z0.d ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -216,13 +218,15 @@ define <8 x i32> @two_way_i16_i32_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: ldr z0, [x1] ; SVE-NEXT: ldr z1, [x2] ; SVE-NEXT: ptrue p0.s -; SVE-NEXT: ldr z4, [x0] ; SVE-NEXT: uunpklo z2.s, z0.h ; SVE-NEXT: uunpklo z3.s, z1.h ; SVE-NEXT: uunpkhi z0.s, z0.h ; SVE-NEXT: uunpkhi z1.s, z1.h -; SVE-NEXT: mad z2.s, p0/m, z3.s, z4.s -; SVE-NEXT: mad z0.s, p0/m, z1.s, z2.s +; SVE-NEXT: mul z2.s, p0/m, z2.s, z3.s +; SVE-NEXT: ldr z3, [x0] +; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s +; SVE-NEXT: add z1.s, z3.s, z2.s +; SVE-NEXT: add z0.s, z0.s, z1.s ; SVE-NEXT: mov z1.d, z0.d ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -339,13 +343,15 @@ define <4 x i64> @two_way_i32_i64_vl256(ptr %accptr, ptr %uptr, ptr %sptr) vscal ; SVE-NEXT: ldr z0, [x1] ; SVE-NEXT: ldr z1, [x2] ; SVE-NEXT: ptrue p0.d -; SVE-NEXT: ldr z4, [x0] ; SVE-NEXT: uunpklo z2.d, z0.s ; SVE-NEXT: uunpklo z3.d, z1.s ; SVE-NEXT: uunpkhi z0.d, z0.s ; SVE-NEXT: uunpkhi z1.d, z1.s -; SVE-NEXT: mad z2.d, p0/m, z3.d, z4.d -; SVE-NEXT: mad z0.d, p0/m, z1.d, z2.d +; SVE-NEXT: mul z2.d, p0/m, z2.d, z3.d +; SVE-NEXT: ldr z3, [x0] +; SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d +; SVE-NEXT: add z1.d, z3.d, z2.d +; SVE-NEXT: add z0.d, z0.d, z1.d ; SVE-NEXT: mov z1.d, z0.d ; SVE-NEXT: ext z1.b, z1.b, z0.b, #16 ; SVE-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -527,22 +533,27 @@ define <2 x i64> @four_way_i16_i64_vl128_usdot(ptr %accptr, ptr %uptr, ptr %sptr ; SME-LABEL: four_way_i16_i64_vl128_usdot: ; SME: // %bb.0: ; SME-NEXT: ptrue p0.d, vl2 -; SME-NEXT: ldr q2, [x0] ; SME-NEXT: mov x8, #2 // =0x2 -; SME-NEXT: ld1h { z0.d }, p0/z, [x1] -; SME-NEXT: ld1sh { z1.d }, p0/z, [x2] -; SME-NEXT: mad z0.d, p0/m, z1.d, z2.d -; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1] -; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1] -; SME-NEXT: mov x8, #4 // =0x4 -; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d -; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1] -; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1] +; SME-NEXT: mov x9, #4 // =0x4 +; SME-NEXT: ldr q6, [x0] +; SME-NEXT: ld1h { z0.d }, p0/z, [x1, x8, lsl 
#1] +; SME-NEXT: ld1sh { z4.d }, p0/z, [x2, x8, lsl #1] +; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x9, lsl #1] +; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x9, lsl #1] +; SME-NEXT: ld1h { z3.d }, p0/z, [x1] +; SME-NEXT: ld1sh { z5.d }, p0/z, [x2] ; SME-NEXT: mov x8, #6 // =0x6 -; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d -; SME-NEXT: ld1h { z1.d }, p0/z, [x1, x8, lsl #1] -; SME-NEXT: ld1sh { z2.d }, p0/z, [x2, x8, lsl #1] -; SME-NEXT: mla z0.d, p0/m, z2.d, z1.d +; SME-NEXT: mul z0.d, z4.d, z0.d +; SME-NEXT: ld1h { z4.d }, p0/z, [x1, x8, lsl #1] +; SME-NEXT: mul z1.d, z2.d, z1.d +; SME-NEXT: mul z2.d, z5.d, z3.d +; SME-NEXT: ld1sh { z5.d }, p0/z, [x2, x8, lsl #1] +; SME-NEXT: sel z3.d, p0, z6.d, z0.d +; SME-NEXT: add z0.d, z0.d, z1.d +; SME-NEXT: mul z1.d, z5.d, z4.d +; SME-NEXT: add z2.d, z3.d, z2.d +; SME-NEXT: add z0.d, z2.d, z0.d +; SME-NEXT: add z0.d, z0.d, z1.d ; SME-NEXT: // kill: def $q0 killed $q0 killed $z0 ; SME-NEXT: ret %acc = load <2 x i64>, ptr %accptr diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll index 24c5dccd5b420..9a74fa0a21820 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll @@ -216,7 +216,9 @@ define void @trn_v32i8(ptr %a, ptr %b) #0 { ; VBITS_EQ_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.b, z0.b, z1.b ; VBITS_EQ_512-NEXT: trn2 z0.b, z0.b, z1.b -; VBITS_EQ_512-NEXT: add z0.b, z2.b, z0.b +; VBITS_EQ_512-NEXT: sel z1.b, p0, z2.b, z0.b +; VBITS_EQ_512-NEXT: mov z0.b, p0/m, z0.b +; VBITS_EQ_512-NEXT: add z0.b, z1.b, z0.b ; VBITS_EQ_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <32 x i8>, ptr %a @@ -281,7 +283,9 @@ define void @trn_v16i16(ptr %a, ptr %b) #0 { ; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.h, z0.h, z1.h ; VBITS_EQ_512-NEXT: trn2 z0.h, z0.h, z1.h -; VBITS_EQ_512-NEXT: add z0.h, z2.h, z0.h +; VBITS_EQ_512-NEXT: sel z1.h, p0, z2.h, z0.h +; VBITS_EQ_512-NEXT: mov z0.h, p0/m, z0.h +; VBITS_EQ_512-NEXT: add z0.h, z1.h, z0.h ; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <16 x i16>, ptr %a @@ -311,7 +315,9 @@ define void @trn_v8i32(ptr %a, ptr %b) #0 { ; VBITS_EQ_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.s, z0.s, z1.s ; VBITS_EQ_512-NEXT: trn2 z0.s, z0.s, z1.s -; VBITS_EQ_512-NEXT: add z0.s, z2.s, z0.s +; VBITS_EQ_512-NEXT: sel z1.s, p0, z2.s, z0.s +; VBITS_EQ_512-NEXT: mov z0.s, p0/m, z0.s +; VBITS_EQ_512-NEXT: add z0.s, z1.s, z0.s ; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <8 x i32>, ptr %a @@ -389,6 +395,8 @@ define void @trn_v8i32_undef(ptr %a) #0 { ; VBITS_EQ_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_EQ_512-NEXT: trn1 z1.s, z0.s, z0.s ; VBITS_EQ_512-NEXT: trn2 z0.s, z0.s, z0.s +; VBITS_EQ_512-NEXT: sel z1.s, p0, z1.s, z0.s +; VBITS_EQ_512-NEXT: mov z0.s, p0/m, z0.s ; VBITS_EQ_512-NEXT: add z0.s, z1.s, z0.s ; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_EQ_512-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll index 23ae5f00b5a45..39658ab527b31 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll @@ -100,10 +100,11 @@ define i1 @ptest_and_v16i1_512bit_sve(ptr %a, ptr %b) vscale_range(4, 4) { ; CHECK-LABEL: ptest_and_v16i1_512bit_sve: ; CHECK: // 
%bb.0: ; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x1] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: ldr z0, [x1] -; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll index 3e6a7ce34a9ae..28038ddb2a963 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-subvector.ll @@ -426,6 +426,8 @@ define void @no_subvector_binop_hang(ptr %in, ptr %out, i1 %cond) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: .LBB23_2: // %bb.2 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll index 8dc45eadce6f3..6841d0be82906 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll @@ -35,7 +35,8 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 -; VBITS_GE_256-NEXT: add z0.b, z1.b, z1.b +; VBITS_GE_256-NEXT: mov z0.b, p0/m, z1.b +; VBITS_GE_256-NEXT: add z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; @@ -205,7 +206,8 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 -; VBITS_GE_256-NEXT: add z0.h, z1.h, z1.h +; VBITS_GE_256-NEXT: mov z0.h, p0/m, z1.h +; VBITS_GE_256-NEXT: add z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; @@ -459,7 +461,8 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) #0 { ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: add z0.s, z1.s, z1.s +; VBITS_GE_256-NEXT: mov z0.s, p0/m, z1.s +; VBITS_GE_256-NEXT: add z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll b/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll index bcf5063bdda04..ce79ff3d8a965 100644 --- a/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll +++ b/llvm/test/CodeGen/AArch64/sve-indexed-arithmetic.ll @@ -230,10 +230,12 @@ define void @fmla_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 { define void @fmls_indexed_f16_256b(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmls_indexed_f16_256b: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z0, [x1] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: ldr z2, [x2] -; CHECK-NEXT: fmls z2.h, z1.h, z0.h[2] +; CHECK-NEXT: fneg z0.h, p0/m, z0.h +; CHECK-NEXT: fmla z2.h, z0.h, z1.h[2] ; CHECK-NEXT: str z2, [x2] ; CHECK-NEXT: ret %ld.a = load <16 x half>, ptr %a @@ -303,10 +305,12 @@ define void @fmls_indexed_bf16_256b(ptr %a, ptr 
%b, ptr %c) #0 { define void @fmls_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmls_indexed_f32_256b: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z0, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: ldr z2, [x2] -; CHECK-NEXT: fmls z2.s, z1.s, z0.s[3] +; CHECK-NEXT: fneg z0.s, p0/m, z0.s +; CHECK-NEXT: fmla z2.s, z0.s, z1.s[3] ; CHECK-NEXT: str z2, [x2] ; CHECK-NEXT: ret %ld.a = load <8 x float>, ptr %a @@ -323,10 +327,12 @@ define void @fmls_indexed_f32_256b(ptr %a, ptr %b, ptr %c) #0 { define void @fmls_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmls_indexed_f64_256b_trn1: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z0, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: ldr z2, [x2] -; CHECK-NEXT: fmls z2.d, z1.d, z0.d[0] +; CHECK-NEXT: fneg z0.d, p0/m, z0.d +; CHECK-NEXT: fmla z2.d, z0.d, z1.d[0] ; CHECK-NEXT: str z2, [x2] ; CHECK-NEXT: ret %ld.a = load <4 x double>, ptr %a @@ -342,10 +348,12 @@ define void @fmls_indexed_f64_256b_trn1(ptr %a, ptr %b, ptr %c) #0 { define void @fmls_indexed_f64_256b_trn2(ptr %a, ptr %b, ptr %c) #0 { ; CHECK-LABEL: fmls_indexed_f64_256b_trn2: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z0, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z1, [x0] ; CHECK-NEXT: ldr z2, [x2] -; CHECK-NEXT: fmls z2.d, z1.d, z0.d[1] +; CHECK-NEXT: fneg z0.d, p0/m, z0.d +; CHECK-NEXT: fmla z2.d, z0.d, z1.d[1] ; CHECK-NEXT: str z2, [x2] ; CHECK-NEXT: ret %ld.a = load <4 x double>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index bd49db8a4c414..975d62571cf24 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -12,9 +12,12 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctlz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.h, #255 // =0xff ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: sub z0.h, z0.h, #8 // =0x8 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -339,9 +342,12 @@ define void @ctlz_v32i8(ptr %a) { define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctlz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.s, #65535 // =0xffff ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: sub z0.s, z0.s, #16 // =0x10 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -732,9 +738,12 @@ define void @ctlz_v4i64(ptr %a) { define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctpop_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; 
CHECK-NEXT: mov z1.h, #255 // =0xff +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1602,9 +1611,12 @@ define void @ctpop_v32i8(ptr %a) { define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctpop_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.s, #65535 // =0xffff +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -2484,9 +2496,11 @@ define void @ctpop_v4i64(ptr %a) { define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: cttz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: orr z0.h, z0.h, #0x100 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: mov z1.h, p0/m, #256 // =0x100 +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -2875,9 +2889,12 @@ define void @cttz_v32i8(ptr %a) { define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: cttz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: orr z0.s, z0.s, #0x10000 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.s, #0x10000 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index d29e43509dfe9..5a93cf2e5e37a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -14,22 +14,30 @@ target triple = "aarch64" define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %right_ptr) { ; CHECK-LABEL: fixed_bitselect_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z0.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: ldp q5, q4, [x1] -; CHECK-NEXT: ldp q6, q7, [x2] -; CHECK-NEXT: add z3.s, z1.s, z0.s -; CHECK-NEXT: subr z1.s, z1.s, #0 // =0x0 -; CHECK-NEXT: add z0.s, z2.s, z0.s -; CHECK-NEXT: subr z2.s, z2.s, #0 // =0x0 -; CHECK-NEXT: and z1.d, z1.d, z4.d -; CHECK-NEXT: and z3.d, z3.d, z7.d -; CHECK-NEXT: and z0.d, z0.d, z6.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: orr z1.d, z3.d, z1.d +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: ldp q2, q3, [x1] +; CHECK-NEXT: mov z4.s, p0/m, #0 // =0x0 +; CHECK-NEXT: ldp q5, q6, [x2] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: mov z7.s, p0/m, #-1 // =0xffffffffffffffff +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z3.s, p0, z3.s, z0.s +; CHECK-NEXT: sel z5.s, p0, z5.s, z0.s +; CHECK-NEXT: sel z6.s, p0, z6.s, z0.s +; CHECK-NEXT: sub z16.s, z4.s, z0.s +; CHECK-NEXT: add z0.s, z0.s, 
z7.s +; CHECK-NEXT: sub z4.s, z4.s, z1.s +; CHECK-NEXT: add z1.s, z1.s, z7.s +; CHECK-NEXT: and z3.d, z16.d, z3.d +; CHECK-NEXT: and z2.d, z4.d, z2.d +; CHECK-NEXT: and z4.d, z0.d, z6.d +; CHECK-NEXT: and z0.d, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z0.d, z2.d -; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 +; CHECK-NEXT: orr z1.d, z4.d, z3.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ec0693a541e44..8b845dff64ffe 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -199,6 +199,13 @@ define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z4.h, z4.h, #0x1 +; CHECK-NEXT: and z5.h, z5.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, #0 ; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -434,6 +441,13 @@ define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z4.s, z4.s, #0x1 +; CHECK-NEXT: and z5.s, z5.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0 ; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -558,6 +572,13 @@ define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0 ; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index 1fdcd4f826870..4ffa4081a69a5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -927,13 +927,18 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: udiv_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mov z2.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sel z2.h, p0, z2.h, z0.h +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: ptrue p0.s, vl4 -; 
CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: and z1.d, z1.d, z2.d ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1376,11 +1381,15 @@ define void @udiv_v32i8(ptr %a, ptr %b) { define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: udiv_v2i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.s, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1839,19 +1848,24 @@ define void @udiv_v4i64(ptr %a, ptr %b) { define void @udiv_constantsplat_v8i32(ptr %a) { ; CHECK-LABEL: udiv_constantsplat_v8i32: ; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov w8, #8969 // =0x2309 -; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movk w8, #22765, lsl #16 -; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: umulh z3.s, z1.s, z0.s -; CHECK-NEXT: umulh z0.s, z2.s, z0.s -; CHECK-NEXT: sub z1.s, z1.s, z3.s -; CHECK-NEXT: sub z2.s, z2.s, z0.s -; CHECK-NEXT: usra z3.s, z1.s, #1 -; CHECK-NEXT: usra z0.s, z2.s, #1 -; CHECK-NEXT: lsr z1.s, z3.s, #6 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: umulh z3.s, z0.s, z2.s +; CHECK-NEXT: umulh z2.s, z1.s, z2.s +; CHECK-NEXT: sub z0.s, z0.s, z3.s +; CHECK-NEXT: sub z1.s, z1.s, z2.s +; CHECK-NEXT: lsr z0.s, z0.s, #1 +; CHECK-NEXT: lsr z1.s, z1.s, #1 +; CHECK-NEXT: add z0.s, z0.s, z3.s +; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: lsr z0.s, z0.s, #6 -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: lsr z1.s, z1.s, #6 +; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index 75911e5ff1569..ecfb3dcd3d425 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -174,6 +174,9 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v32i8_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: sunpklo z2.h, z0.b @@ -463,6 +466,9 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v32i8_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: sunpklo z2.h, z0.b @@ -470,9 +476,9 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z3.h, z1.b ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 
; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z5.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z6.s, z0.h @@ -482,8 +488,8 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: stp q4, q2, [x1, #64] +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: stp q5, q3, [x1] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] @@ -820,60 +826,63 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v32i8_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: sunpklo z3.h, z1.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.h, z2.b ; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sunpklo z4.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: sunpklo z2.h, z2.b -; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: sunpklo z3.s, z3.h -; CHECK-NEXT: sunpklo z16.d, z4.s -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z6.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z16.d, z4.s +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z5.d, z5.s -; CHECK-NEXT: sunpklo z4.d, z4.s -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z19.d, z3.s ; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 ; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z4.d, z4.s ; CHECK-NEXT: sunpklo z18.d, z6.s ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: sunpklo z19.d, z3.s +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z7.s, z7.h +; CHECK-NEXT: sunpklo z17.d, z17.s ; CHECK-NEXT: sunpklo z20.d, z1.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: stp q16, q4, [x1, #128] ; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: sunpklo z16.d, z0.s -; CHECK-NEXT: sunpklo z17.d, z17.s -; CHECK-NEXT: mov z4.d, z7.d ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 -; CHECK-NEXT: stp q19, q3, [x1, #160] -; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q5, q17, [x1] +; CHECK-NEXT: mov z4.d, z7.d ; CHECK-NEXT: sunpklo z5.d, z6.s ; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q20, q1, [x1, #192] +; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: stp q19, q3, [x1, #160] +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 ; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: sunpklo z1.d, z4.s ; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 ; CHECK-NEXT: sunpklo z2.d, z2.s -; 
CHECK-NEXT: stp q16, q0, [x1, #32] ; CHECK-NEXT: stp q18, q5, [x1, #64] +; CHECK-NEXT: stp q20, q1, [x1, #192] +; CHECK-NEXT: sunpklo z1.d, z4.s +; CHECK-NEXT: stp q16, q0, [x1, #32] ; CHECK-NEXT: sunpklo z3.d, z6.s ; CHECK-NEXT: stp q7, q1, [x1, #224] ; CHECK-NEXT: stp q2, q3, [x1, #96] @@ -1094,6 +1103,9 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v16i16_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: sunpklo z2.s, z0.h @@ -1265,6 +1277,9 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v16i16_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: sunpklo z2.s, z0.h @@ -1272,9 +1287,9 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z4.d, z2.s ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z5.d, z3.s ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z6.d, z0.s @@ -1284,8 +1299,8 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: stp q4, q2, [x1, #64] +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: stp q5, q3, [x1] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] @@ -1407,6 +1422,9 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-LABEL: sext_v8i32_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: sunpklo z2.d, z0.s @@ -1527,6 +1545,9 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v32i8_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: uunpklo z2.h, z0.b @@ -1816,6 +1837,9 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v32i8_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: uunpklo z2.h, z0.b @@ -1823,9 +1847,9 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z3.h, z1.b ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z5.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z6.s, z0.h @@ -1835,8 +1859,8 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z0.s, z0.h -; 
CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: stp q4, q2, [x1, #64] +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: stp q5, q3, [x1] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] @@ -2014,8 +2038,12 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; CHECK-LABEL: zext_v4i8_v4i64: ; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: mov z1.h, #255 // =0xff +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 @@ -2183,60 +2211,63 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v32i8_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z0.b, z0.b -; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z3.h, z1.b +; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.h, z2.b ; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z3.h ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: uunpklo z2.h, z2.b -; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: uunpklo z3.s, z3.h -; CHECK-NEXT: uunpklo z16.d, z4.s -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z6.s, z2.h ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z16.d, z4.s +; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z5.d, z5.s -; CHECK-NEXT: uunpklo z4.d, z4.s -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z19.d, z3.s ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 ; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z4.d, z4.s ; CHECK-NEXT: uunpklo z18.d, z6.s ; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: uunpklo z19.d, z3.s +; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z7.s, z7.h +; CHECK-NEXT: uunpklo z17.d, z17.s ; CHECK-NEXT: uunpklo z20.d, z1.s ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: stp q16, q4, [x1, #128] ; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z16.d, z0.s -; CHECK-NEXT: uunpklo z17.d, z17.s -; CHECK-NEXT: mov z4.d, z7.d ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 -; CHECK-NEXT: stp q19, q3, [x1, #160] -; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q5, q17, [x1] +; CHECK-NEXT: mov z4.d, z7.d ; CHECK-NEXT: uunpklo z5.d, z6.s ; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q20, q1, [x1, #192] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: stp q19, q3, [x1, #160] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 ; CHECK-NEXT: uunpklo z7.d, z7.s -; CHECK-NEXT: 
uunpklo z1.d, z4.s ; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 ; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: stp q16, q0, [x1, #32] ; CHECK-NEXT: stp q18, q5, [x1, #64] +; CHECK-NEXT: stp q20, q1, [x1, #192] +; CHECK-NEXT: uunpklo z1.d, z4.s +; CHECK-NEXT: stp q16, q0, [x1, #32] ; CHECK-NEXT: uunpklo z3.d, z6.s ; CHECK-NEXT: stp q7, q1, [x1, #224] ; CHECK-NEXT: stp q2, q3, [x1, #96] @@ -2480,6 +2511,9 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v16i16_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: uunpklo z2.s, z0.h @@ -2657,6 +2691,9 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v16i16_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: add z1.h, z1.h, z1.h ; CHECK-NEXT: uunpklo z2.s, z0.h @@ -2664,9 +2701,9 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z4.d, z2.s ; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z5.d, z3.s ; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z6.d, z0.s @@ -2676,8 +2713,8 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: stp q4, q2, [x1, #64] +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: stp q5, q3, [x1] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] @@ -2811,6 +2848,9 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-LABEL: zext_v8i32_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z0.s ; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: uunpklo z2.d, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll index 94d5bb1543b0e..89c682dfb73b9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll @@ -16,9 +16,10 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; NO-FA64: // %bb.0: ; NO-FA64-NEXT: ptrue p0.b, vl8 ; NO-FA64-NEXT: // kill: def $d0 killed $d0 def $z0 -; NO-FA64-NEXT: // kill: def $d2 killed $d2 def $z2 ; NO-FA64-NEXT: // kill: def $d1 killed $d1 def $z1 -; NO-FA64-NEXT: mad z0.b, p0/m, z1.b, z2.b +; NO-FA64-NEXT: // kill: def $d2 killed $d2 def $z2 +; NO-FA64-NEXT: mul z0.b, p0/m, z0.b, z1.b +; NO-FA64-NEXT: add z0.b, z2.b, z0.b ; NO-FA64-NEXT: // kill: def $d0 killed $d0 killed $z0 ; NO-FA64-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index 0c97eedd4362d..53a8322465a97 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -1217,11 +1217,15 @@ define void @smulh_v4i64(ptr %a, ptr %b) { define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE-LABEL: umulh_v4i8: ; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE-NEXT: ptrue p0.h, vl4 -; SVE-NEXT: and z0.h, z0.h, #0xff -; SVE-NEXT: and z1.h, z1.h, #0xff +; SVE-NEXT: mov z2.h, #255 // =0xff +; SVE-NEXT: sel z1.h, p0, z1.h, z0.h +; SVE-NEXT: mov z0.h, p0/m, z0.h +; SVE-NEXT: sel z2.h, p0, z2.h, z0.h +; SVE-NEXT: and z0.d, z0.d, z2.d +; SVE-NEXT: and z1.d, z1.d, z2.d ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: lsr z0.h, z0.h, #4 ; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1229,10 +1233,15 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; SVE2-LABEL: umulh_v4i8: ; SVE2: // %bb.0: +; SVE2-NEXT: ptrue p0.h, vl4 ; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE2-NEXT: and z0.h, z0.h, #0xff -; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: mov z2.h, #255 // =0xff +; SVE2-NEXT: sel z1.h, p0, z1.h, z0.h +; SVE2-NEXT: mov z0.h, p0/m, z0.h +; SVE2-NEXT: sel z2.h, p0, z2.h, z0.h +; SVE2-NEXT: and z0.d, z0.d, z2.d +; SVE2-NEXT: and z1.d, z1.d, z2.d ; SVE2-NEXT: mul z0.h, z0.h, z1.h ; SVE2-NEXT: lsr z0.h, z0.h, #4 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1770,11 +1779,15 @@ define void @umulh_v32i8(ptr %a, ptr %b) { define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE-LABEL: umulh_v2i16: ; SVE: // %bb.0: +; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE-NEXT: ptrue p0.s, vl2 -; SVE-NEXT: and z0.s, z0.s, #0xffff -; SVE-NEXT: and z1.s, z1.s, #0xffff +; SVE-NEXT: mov z2.s, #65535 // =0xffff +; SVE-NEXT: sel z1.s, p0, z1.s, z0.s +; SVE-NEXT: mov z0.s, p0/m, z0.s +; SVE-NEXT: sel z2.s, p0, z2.s, z0.s +; SVE-NEXT: and z0.d, z0.d, z2.d +; SVE-NEXT: and z1.d, z1.d, z2.d ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: lsr z0.s, z0.s, #16 ; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1782,10 +1795,15 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; SVE2-LABEL: umulh_v2i16: ; SVE2: // %bb.0: +; SVE2-NEXT: ptrue p0.s, vl2 ; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 -; SVE2-NEXT: and z0.s, z0.s, #0xffff -; SVE2-NEXT: and z1.s, z1.s, #0xffff +; SVE2-NEXT: mov z2.s, #65535 // =0xffff +; SVE2-NEXT: sel z1.s, p0, z1.s, z0.s +; SVE2-NEXT: mov z0.s, p0/m, z0.s +; SVE2-NEXT: sel z2.s, p0, z2.s, z0.s +; SVE2-NEXT: and z0.d, z0.d, z2.d +; SVE2-NEXT: and z1.d, z1.d, z2.d ; SVE2-NEXT: mul z0.s, z0.s, z1.s ; SVE2-NEXT: lsr z0.s, z0.s, #16 ; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index 2678324728d0e..6ef8fff8fee65 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -99,6 +99,8 @@ define i8 @uaddv_v32i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: add z0.b, z1.b, z0.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 
@@ -245,6 +247,8 @@ define i16 @uaddv_v16i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -343,6 +347,8 @@ define i32 @uaddv_v8i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -396,6 +402,8 @@ define i64 @uaddv_v4i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 9497ec88e57b4..cdeb48acf8ccf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -15,14 +15,15 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p1.s, vl4 ; CHECK-NEXT: sxtb z0.h, p0/m, z0.h ; CHECK-NEXT: sxtb z1.h, p0/m, z1.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -78,9 +79,9 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h ; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } -; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, z2.b, z1.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -177,8 +178,8 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z5.b, z3.b, z3.b ; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, z2.b, z1.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -350,10 +351,11 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } ; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b +; CHECK-NEXT: mul z1.b, z5.b, z1.b ; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b -; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: mul z2.b, z2.b, z4.b +; CHECK-NEXT: sub z2.b, z3.b, z2.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; @@ -545,9 +547,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sdivr z2.s, 
p0/m, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -603,8 +605,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -691,11 +693,12 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h ; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } +; CHECK-NEXT: mul z2.h, z2.h, z4.h +; CHECK-NEXT: sub z2.h, z3.h, z2.h ; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h ; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: msb z2.h, p0/m, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z5.h, z1.h +; CHECK-NEXT: mul z1.h, z5.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; @@ -806,7 +809,8 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, z2.s, z1.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -839,7 +843,8 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, z2.s, z1.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -882,9 +887,10 @@ define void @srem_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z0.s ; CHECK-NEXT: movprfx z5, z2 ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: msb z0.s, p0/m, z4.s, z1.s -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s +; CHECK-NEXT: mul z0.s, z4.s, z0.s +; CHECK-NEXT: mul z3.s, z5.s, z3.s +; CHECK-NEXT: sub z0.s, z1.s, z0.s +; CHECK-NEXT: sub z1.s, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; @@ -950,7 +956,8 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, z2.d, z1.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -978,7 +985,8 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, z2.d, z1.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1012,9 +1020,10 @@ define void @srem_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z0.d ; CHECK-NEXT: movprfx z5, z2 ; CHECK-NEXT: sdiv z5.d, p0/m, z5.d, z3.d -; CHECK-NEXT: msb z0.d, p0/m, z4.d, z1.d -; CHECK-NEXT: 
movprfx z1, z2 -; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d +; CHECK-NEXT: mul z0.d, z4.d, z0.d +; CHECK-NEXT: mul z3.d, z5.d, z3.d +; CHECK-NEXT: sub z0.d, z1.d, z0.d +; CHECK-NEXT: sub z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; @@ -1062,17 +1071,22 @@ define void @srem_v4i64(ptr %a, ptr %b) { define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: urem_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mov z2.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sel z2.h, p0, z2.h, z0.h +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: and z1.d, z1.d, z2.d ; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -1128,9 +1142,9 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uzp1 z3.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h ; CHECK-NEXT: splice z2.h, p0, { z3.h, z4.h } -; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, z2.b, z1.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -1227,8 +1241,8 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z5.b, z3.b, z3.b ; CHECK-NEXT: splice z2.b, p0, { z4.b, z5.b } -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b +; CHECK-NEXT: mul z1.b, z2.b, z1.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1400,10 +1414,11 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: splice z5.b, p0, { z6.b, z7.b } ; CHECK-NEXT: uzp1 z18.b, z16.b, z16.b +; CHECK-NEXT: mul z1.b, z5.b, z1.b ; CHECK-NEXT: splice z2.b, p0, { z17.b, z18.b } -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: mls z0.b, p0/m, z5.b, z1.b -; CHECK-NEXT: msb z2.b, p0/m, z4.b, z3.b +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: mul z2.b, z2.b, z4.b +; CHECK-NEXT: sub z2.b, z3.b, z2.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; @@ -1595,9 +1610,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -1653,8 +1668,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z4.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z5.h, z3.h, z3.h ; CHECK-NEXT: splice z2.h, p0, { z4.h, z5.h } -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h +; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: // 
kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1741,11 +1756,12 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h ; CHECK-NEXT: splice z2.h, p0, { z16.h, z17.h } +; CHECK-NEXT: mul z2.h, z2.h, z4.h +; CHECK-NEXT: sub z2.h, z3.h, z2.h ; CHECK-NEXT: uzp1 z6.h, z7.h, z7.h ; CHECK-NEXT: splice z5.h, p0, { z5.h, z6.h } -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: msb z2.h, p0/m, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z5.h, z1.h +; CHECK-NEXT: mul z1.h, z5.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret ; @@ -1856,7 +1872,8 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, z2.s, z1.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -1889,7 +1906,8 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s +; CHECK-NEXT: mul z1.s, z2.s, z1.s +; CHECK-NEXT: sub z0.s, z0.s, z1.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -1932,9 +1950,10 @@ define void @urem_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z0.s ; CHECK-NEXT: movprfx z5, z2 ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s -; CHECK-NEXT: msb z0.s, p0/m, z4.s, z1.s -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: mls z1.s, p0/m, z5.s, z3.s +; CHECK-NEXT: mul z0.s, z4.s, z0.s +; CHECK-NEXT: mul z3.s, z5.s, z3.s +; CHECK-NEXT: sub z0.s, z1.s, z0.s +; CHECK-NEXT: sub z1.s, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; @@ -2000,7 +2019,8 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, z2.d, z1.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret ; @@ -2028,7 +2048,8 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: mul z1.d, z2.d, z1.d +; CHECK-NEXT: sub z0.d, z0.d, z1.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret ; @@ -2062,9 +2083,10 @@ define void @urem_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z0.d ; CHECK-NEXT: movprfx z5, z2 ; CHECK-NEXT: udiv z5.d, p0/m, z5.d, z3.d -; CHECK-NEXT: msb z0.d, p0/m, z4.d, z1.d -; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: mls z1.d, p0/m, z5.d, z3.d +; CHECK-NEXT: mul z0.d, z4.d, z0.d +; CHECK-NEXT: mul z3.d, z5.d, z3.d +; CHECK-NEXT: sub z0.d, z1.d, z0.d +; CHECK-NEXT: sub z1.d, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index d0f99211e80fc..5e029d2953e09 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -12,11 
+12,14 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: ashr_v4i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #255 // =0xff ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: sel z2.h, p0, z2.h, z0.h ; CHECK-NEXT: sxtb z0.h, p0/m, z0.h +; CHECK-NEXT: and z1.d, z1.d, z2.d ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -350,11 +353,14 @@ define void @ashr_v32i8(ptr %a, ptr %b) { define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: ashr_v2i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.s, #65535 // =0xffff ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s +; CHECK-NEXT: and z1.d, z1.d, z2.d ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -782,11 +788,15 @@ define void @ashr_v4i64(ptr %a, ptr %b) { define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: lshr_v4i8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: and z1.h, z1.h, #0xff -; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: sel z2.h, p0, z2.h, z0.h +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1120,11 +1130,15 @@ define void @lshr_v32i8(ptr %a, ptr %b) { define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: lshr_v2i16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.s, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: and z1.s, z1.s, #0xffff -; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: and z1.d, z1.d, z2.d +; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1552,10 +1566,13 @@ define void @lshr_v4i64(ptr %a, ptr %b) { define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; CHECK-LABEL: shl_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mov z2.s, #255 // =0xff ; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: and z1.d, z1.d, z2.d ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1582,10 +1599,13 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { define <4 x i8> @shl_v4i8(<4 x i8> %op1, 
<4 x i8> %op2) { ; CHECK-LABEL: shl_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: mov z2.h, #255 // =0xff ; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: sel z2.h, p0, z2.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: and z1.d, z1.d, z2.d ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 7df362826d052..91c1f647316a7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -196,9 +196,12 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-LABEL: ucvtf_v2i16_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z1.s, #65535 // =0xffff ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -411,9 +414,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; CHECK-LABEL: ucvtf_v2i16_v2f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z1.s, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: mov z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 39701131d7db6..12b7886d76c70 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -293,6 +293,13 @@ define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b +; CHECK-NEXT: mov z4.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: and z4.b, z4.b, #0x1 +; CHECK-NEXT: and z5.b, z5.b, #0x1 +; CHECK-NEXT: cmpne p1.b, p0/z, z4.b, #0 +; CHECK-NEXT: cmpne p0.b, p0/z, z5.b, #0 ; CHECK-NEXT: mov z0.b, p1/m, z1.b ; CHECK-NEXT: sel z1.b, p0, z2.b, z3.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -697,6 +704,13 @@ define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z4.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: and z4.h, z4.h, #0x1 +; CHECK-NEXT: and z5.h, z5.h, #0x1 +; CHECK-NEXT: cmpne p1.h, p0/z, z4.h, #0 +; CHECK-NEXT: cmpne p0.h, p0/z, z5.h, 
#0 ; CHECK-NEXT: mov z0.h, p1/m, z1.h ; CHECK-NEXT: sel z1.h, p0, z2.h, z3.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -911,6 +925,13 @@ define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: and z4.s, z4.s, #0x1 +; CHECK-NEXT: and z5.s, z5.s, #0x1 +; CHECK-NEXT: cmpne p1.s, p0/z, z4.s, #0 +; CHECK-NEXT: cmpne p0.s, p0/z, z5.s, #0 ; CHECK-NEXT: mov z0.s, p1/m, z1.s ; CHECK-NEXT: sel z1.s, p0, z2.s, z3.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -1044,6 +1065,13 @@ define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: and z4.d, z4.d, #0x1 +; CHECK-NEXT: and z5.d, z5.d, #0x1 +; CHECK-NEXT: cmpne p1.d, p0/z, z4.d, #0 +; CHECK-NEXT: cmpne p0.d, p0/z, z5.d, #0 ; CHECK-NEXT: mov z0.d, p1/m, z1.d ; CHECK-NEXT: sel z1.d, p0, z2.d, z3.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index d4565c4b69c77..93f14483de996 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -127,6 +127,8 @@ define i8 @andv_v32i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -295,6 +297,8 @@ define i16 @andv_v16i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -393,6 +397,8 @@ define i32 @andv_v8i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -446,6 +452,8 @@ define i64 @andv_v4i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -588,6 +596,8 @@ define i8 @eorv_v32i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -756,6 +766,8 @@ define i16 @eorv_v16i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -854,6 +866,8 @@ define i32 @eorv_v8i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] 
; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -907,6 +921,8 @@ define i64 @eorv_v4i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -1049,6 +1065,8 @@ define i8 @orv_v32i8(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -1217,6 +1235,8 @@ define i16 @orv_v16i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -1315,6 +1335,8 @@ define i32 @orv_v8i32(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -1368,6 +1390,8 @@ define i64 @orv_v4i64(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q1, q0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov z0.d, p0/m, z0.d +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll index b1ac9469c0573..12987ac32cf23 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll @@ -13,8 +13,9 @@ define <2 x i64> @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2, 2) { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: index z1.d, #1, #1 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: index z0.d, #1, #1 +; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: ldr q1, [x1] @@ -106,6 +107,7 @@ define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2, 2) { ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: index z1.d, #1, #1 ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: sel z1.d, p0, z1.d, z0.d ; CHECK-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: and z1.d, z2.d, z1.d ; CHECK-NEXT: uaddv d2, p0, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 48a642c908bfe..31e155fc8fb1c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -2669,11 +2669,15 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-NEXT: fmov s0, w2 ; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: adrp x8, .LCPI13_0 -; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fmov s2, w3 +; CHECK-NEXT: ptrue p0.h, vl4 ; 
CHECK-NEXT: zip1 z0.h, z1.h, z0.h -; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: zip1 z1.h, z2.h, z0.h ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI13_0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 @@ -2742,11 +2746,15 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; CHECK-NEXT: fmov s0, w2 ; CHECK-NEXT: fmov s1, w1 ; CHECK-NEXT: adrp x8, .LCPI14_0 -; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: fmov s2, w3 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: zip1 z0.h, z1.h, z0.h -; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: zip1 z1.h, z2.h, z0.h ; CHECK-NEXT: zip1 z0.s, z0.s, z1.s ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI14_0] +; CHECK-NEXT: mov z0.h, p0/m, z0.h +; CHECK-NEXT: sel z1.h, p0, z1.h, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: lsl z0.h, z0.h, #15 ; CHECK-NEXT: asr z0.h, z0.h, #15 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index 90466e3cebd5e..a4223da0052e6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -122,46 +122,48 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1, #32] ; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q2, q3, [x1, #32] ; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: ldp q6, q1, [x1] -; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmne p4.s, p0/z, z5.s, #0.0 ; CHECK-NEXT: fcmne p7.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmne p6.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: fcmne p0.s, p0/z, z6.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uzp1 z17.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z19.h, z2.h, z2.h +; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uzp1 z16.h, z1.h, z1.h -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z19.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z18.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z0.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.h, z7.h, z7.h ; CHECK-NEXT: splice z4.h, p0, { z16.h, z17.h } -; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } ; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } ; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b ; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; 
CHECK-NEXT: uzp1 z5.b, z5.b, z5.b ; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: splice z1.b, p0, { z4.b, z5.b } ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 @@ -331,46 +333,48 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q2, q3, [x1, #32] ; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q2, q3, [x1, #32] ; CHECK-NEXT: fcmne p1.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: ldp q6, q1, [x1] -; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmne p4.s, p0/z, z5.s, #0.0 ; CHECK-NEXT: fcmne p7.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmne p6.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: fcmne p0.s, p0/z, z6.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uzp1 z17.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z19.h, z2.h, z2.h +; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uzp1 z16.h, z1.h, z1.h -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z3.h, z3.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z19.h, z2.h, z2.h +; CHECK-NEXT: uzp1 z0.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z18.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z0.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.h, z7.h, z7.h ; CHECK-NEXT: splice z4.h, p0, { z16.h, z17.h } -; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } ; CHECK-NEXT: splice z0.h, p0, { z0.h, z1.h } +; CHECK-NEXT: splice z5.h, p0, { z18.h, z19.h } ; CHECK-NEXT: splice z1.h, p0, { z2.h, z3.h } ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z4.b, z4.b -; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b ; CHECK-NEXT: uzp1 z2.b, z0.b, z0.b +; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b ; CHECK-NEXT: uzp1 z4.b, z1.b, z1.b ; CHECK-NEXT: splice z0.b, p0, { z2.b, z3.b } ; CHECK-NEXT: splice z1.b, p0, { z4.b, z5.b } ; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mov z0.b, p0/m, z0.b +; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b ; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll index 16d26e442d306..509225954a0e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll +++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll @@ -24,6 +24,8 @@ define void @func_vscale_none(ptr %a, ptr %b) #0 { ; CHECK-ARG-NEXT: ptrue p0.s, vl16 ; CHECK-ARG-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-ARG-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-ARG-NEXT: mov z0.s, p0/m, z0.s +; CHECK-ARG-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-ARG-NEXT: add z0.s, z0.s, z1.s ; CHECK-ARG-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-ARG-NEXT: ret @@ -89,6 +91,10 @@ define void 
@func_vscale2_4(ptr %a, ptr %b) #3 { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s +; CHECK-NEXT: sel z2.s, p0, z2.s, z0.s +; CHECK-NEXT: sel z3.s, p0, z3.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: add z1.s, z2.s, z3.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -126,6 +132,8 @@ define void @func_vscale8_8(ptr %a, ptr %b) #5 { ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] +; CHECK-NEXT: mov z0.s, p0/m, z0.s +; CHECK-NEXT: sel z1.s, p0, z1.s, z0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/ARM/2009-11-02-NegativeLane.ll b/llvm/test/CodeGen/ARM/2009-11-02-NegativeLane.ll index ff182aeb9cbf5..74165746e68bb 100644 --- a/llvm/test/CodeGen/ARM/2009-11-02-NegativeLane.ll +++ b/llvm/test/CodeGen/ARM/2009-11-02-NegativeLane.ll @@ -1,13 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mcpu=cortex-a8 -arm-atomic-cfg-tidy=0 < %s | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" target triple = "armv7-eabi" define arm_aapcs_vfpcc void @foo(ptr nocapture %pBuffer, i32 %numItems) nounwind { +; CHECK-LABEL: foo: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: mov r0, #0 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: bxne lr +; CHECK-NEXT: .LBB0_1: @ %bb.preheader +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vdup.16 q8, r0 +; CHECK-NEXT: vmov.16 d16[2], r0 +; CHECK-NEXT: vmul.i16 q8, q8, q8 +; CHECK-NEXT: vst1.16 {d16[2]}, [r0:16] +; CHECK-NEXT: bx lr entry: br i1 undef, label %return, label %bb bb: ; preds = %bb, %entry -; CHECK: vld1.16 {d16[], d17[]} %0 = load i16, ptr undef, align 2 %1 = insertelement <8 x i16> undef, i16 %0, i32 2 %2 = insertelement <8 x i16> %1, i16 undef, i32 3 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll index b5387ae98e35f..7f84914fca10c 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll @@ -6,32 +6,38 @@ define <32 x i32> @fred(i32 %a0) #0 { ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { -; CHECK-NEXT: r3:2 = combine(#20,#9) -; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: r7 = #64 ; CHECK-NEXT: r1 = #24 -; CHECK-NEXT: r4 = #12 +; CHECK-NEXT: r3:2 = combine(#12,#9) +; CHECK-NEXT: v0 = vxor(v0,v0) +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: r4 = #20 +; CHECK-NEXT: r5 = #7 +; CHECK-NEXT: } +; CHECK-NEXT: { +; CHECK-NEXT: v0 = valign(v0,v0,r7) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1 = vror(v0,r1) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vinsert(r2) -; CHECK-NEXT: r4 = #7 -; CHECK-NEXT: r2 = #116 -; CHECK-NEXT: v0 = vror(v0,r4) +; CHECK-NEXT: r3 = #116 +; CHECK-NEXT: v0 = vror(v0,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v0.w = vinsert(r4) +; CHECK-NEXT: v0.w = vinsert(r5) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vror(v1,r3) +; CHECK-NEXT: v1 = vror(v1,r4) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: v1.w = vinsert(r0) -; CHECK-NEXT: v0 = vror(v0,r2) +; CHECK-NEXT: v0 = vror(v0,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: v1 = vror(v1,r3) +; CHECK-NEXT: v1 = vror(v1,r4) ; CHECK-NEXT: } 
; CHECK-NEXT: { ; CHECK-NEXT: v0 = vor(v0,v1) diff --git a/llvm/test/CodeGen/RISCV/double_reduct.ll b/llvm/test/CodeGen/RISCV/double_reduct.ll index cecdd77a079e4..a59551b43490f 100644 --- a/llvm/test/CodeGen/RISCV/double_reduct.ll +++ b/llvm/test/CodeGen/RISCV/double_reduct.ll @@ -88,11 +88,12 @@ define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) { define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: add_ext_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v10, v8 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v12, v9 +; CHECK-NEXT: vadd.vv v8, v10, v12 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %ae = zext <16 x i8> %a to <16 x i16> @@ -106,14 +107,16 @@ define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) { define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: add_ext_v32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m1, ta, ma -; CHECK-NEXT: vmv.s.x v11, zero -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vwredsumu.vs v10, v10, v11 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v10 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vredsum.vs v8, v16, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vredsum.vs v8, v12, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %ae = zext <32 x i8> %a to <32 x i16> diff --git a/llvm/test/CodeGen/RISCV/pr94265.ll b/llvm/test/CodeGen/RISCV/pr94265.ll index f92cdb4ca7395..25e1ac847baf4 100644 --- a/llvm/test/CodeGen/RISCV/pr94265.ll +++ b/llvm/test/CodeGen/RISCV/pr94265.ll @@ -11,8 +11,9 @@ define <8 x i16> @PR94265(<8 x i32> %a0) #0 { ; RV32I-NEXT: vsra.vi v10, v8, 31 ; RV32I-NEXT: vsrl.vi v10, v10, 26 ; RV32I-NEXT: vadd.vv v8, v8, v10 +; RV32I-NEXT: vsrl.vi v8, v8, 6 ; RV32I-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32I-NEXT: vnsrl.wi v10, v8, 6 +; RV32I-NEXT: vnsrl.wi v10, v8, 0 ; RV32I-NEXT: vsll.vi v8, v10, 10 ; RV32I-NEXT: ret ; @@ -22,8 +23,9 @@ define <8 x i16> @PR94265(<8 x i32> %a0) #0 { ; RV64I-NEXT: vsra.vi v10, v8, 31 ; RV64I-NEXT: vsrl.vi v10, v10, 26 ; RV64I-NEXT: vadd.vv v8, v8, v10 +; RV64I-NEXT: vsrl.vi v8, v8, 6 ; RV64I-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64I-NEXT: vnsrl.wi v10, v8, 6 +; RV64I-NEXT: vnsrl.wi v10, v8, 0 ; RV64I-NEXT: vsll.vi v8, v10, 10 ; RV64I-NEXT: ret %t1 = sdiv <8 x i32> %a0, diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll index 1acc830347de4..5a2a130c894a9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll @@ -106,13 +106,13 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) { ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vid.v v8 ; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: vadd.vx v16, v8, a0 +; CHECK-NEXT: vsaddu.vx v16, v8, a1 +; CHECK-NEXT: vadd.vx v8, v8, a0 +; CHECK-NEXT: vmsltu.vx v0, v16, a2 ; CHECK-NEXT: vsaddu.vx v8, v8, a1 -; CHECK-NEXT: 
vsaddu.vx v16, v16, a1 -; CHECK-NEXT: vmsltu.vx v24, v16, a2 -; CHECK-NEXT: vmsltu.vx v0, v8, a2 +; CHECK-NEXT: vmsltu.vx v16, v8, a2 ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vslideup.vi v0, v24, 2 +; CHECK-NEXT: vslideup.vi v0, v16, 2 ; CHECK-NEXT: ret %mask = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 %index, i64 %tc) ret <32 x i1> %mask diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index 7c9a283dd54bc..a3d41a751143e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -486,10 +486,9 @@ define @extract_nxv6f16_nxv12f16_6( %in) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v13, v10, a0 -; CHECK-NEXT: vslidedown.vx v12, v9, a0 -; CHECK-NEXT: vslideup.vx v12, v10, a0 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vslidedown.vx v8, v9, a0 +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: vslidedown.vx v9, v10, a0 ; CHECK-NEXT: ret %res = call @llvm.vector.extract.nxv6f16.nxv12f16( %in, i64 6) ret %res @@ -539,10 +538,9 @@ define @extract_nxv6bf16_nxv12bf16_6( @llvm.vector.extract.nxv6bf16.nxv12bf16( %in, i64 6) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll index 7a337aa253805..9c2aa42447247 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-i8-index-cornercase.ll @@ -20,8 +20,8 @@ define <512 x i8> @single_source(<512 x i8> %a) { ; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a0, 512 ; CHECK-NEXT: addi a1, sp, 512 -; CHECK-NEXT: vmv.x.s a2, v16 ; CHECK-NEXT: vslidedown.vi v24, v16, 5 +; CHECK-NEXT: vmv.x.s a2, v16 ; CHECK-NEXT: li a3, 432 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vse8.v v8, (a1) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll index 3d83065009f28..eee048084268e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse-vp.ll @@ -16,17 +16,17 @@ define <2 x i8> @vp_bitreverse_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <2 x i8> @llvm.vp.bitreverse.v2i8(<2 x i8> %va, <2 x i1> %m, i32 %evl) ret <2 x i8> %v @@ -42,17 +42,17 @@ define <2 x i8> @vp_bitreverse_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: 
vsrl.vi v9, v8, 2 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <2 x i8> @llvm.vp.bitreverse.v2i8(<2 x i8> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x i8> %v @@ -70,17 +70,17 @@ define <4 x i8> @vp_bitreverse_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <4 x i8> @llvm.vp.bitreverse.v4i8(<4 x i8> %va, <4 x i1> %m, i32 %evl) ret <4 x i8> %v @@ -96,17 +96,17 @@ define <4 x i8> @vp_bitreverse_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <4 x i8> @llvm.vp.bitreverse.v4i8(<4 x i8> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x i8> %v @@ -124,17 +124,17 @@ define <8 x i8> @vp_bitreverse_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: 
vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i8> @llvm.vp.bitreverse.v8i8(<8 x i8> %va, <8 x i1> %m, i32 %evl) ret <8 x i8> %v @@ -150,17 +150,17 @@ define <8 x i8> @vp_bitreverse_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) { ; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <8 x i8> @llvm.vp.bitreverse.v8i8(<8 x i8> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x i8> %v @@ -178,17 +178,17 @@ define <16 x i8> @vp_bitreverse_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext % ; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vi v8, v8, 15, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <16 x i8> @llvm.vp.bitreverse.v16i8(<16 x i8> %va, <16 x i1> %m, i32 %evl) ret <16 x i8> %v @@ -204,17 +204,17 @@ define <16 x i8> @vp_bitreverse_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) ; CHECK-NEXT: vsll.vi v9, v9, 4 ; CHECK-NEXT: vand.vi v8, v8, 15 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: li a0, 85 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <16 x i8> @llvm.vp.bitreverse.v16i8(<16 x i8> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x i8> %v @@ -231,25 +231,25 @@ define <2 x i16> @vp_bitreverse_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, 
a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <2 x i16> @llvm.vp.bitreverse.v2i16(<2 x i16> %va, <2 x i1> %m, i32 %evl) ret <2 x i16> %v @@ -264,25 +264,25 @@ define <2 x i16> @vp_bitreverse_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <2 x i16> @llvm.vp.bitreverse.v2i16(<2 x i16> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x i16> %v @@ -299,25 +299,25 @@ define <4 x i16> @vp_bitreverse_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: 
vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <4 x i16> @llvm.vp.bitreverse.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl) ret <4 x i16> %v @@ -332,25 +332,25 @@ define <4 x i16> @vp_bitreverse_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <4 x i16> @llvm.vp.bitreverse.v4i16(<4 x i16> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x i16> %v @@ -367,25 +367,25 @@ define <8 x i16> @vp_bitreverse_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <8 x i16> @llvm.vp.bitreverse.v8i16(<8 x i16> %va, <8 x i1> %m, i32 %evl) ret <8 x i16> %v @@ -400,25 +400,25 @@ define <8 x i16> @vp_bitreverse_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a0 +; 
CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <8 x i16> @llvm.vp.bitreverse.v8i16(<8 x i16> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x i16> %v @@ -435,25 +435,25 @@ define <16 x i16> @vp_bitreverse_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroex ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -241 ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v10, v10, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v10, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v10, v10, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v10, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v10, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: ret %v = call <16 x i16> @llvm.vp.bitreverse.v16i16(<16 x i16> %va, <16 x i1> %m, i32 %evl) ret <16 x i16> %v @@ -468,25 +468,25 @@ define <16 x i16> @vp_bitreverse_v16i16_unmasked(<16 x i16> %va, i32 zeroext %ev ; CHECK-NEXT: lui a0, 1 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: addi a0, a0, -241 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: lui a0, 3 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 2 +; CHECK-NEXT: vsll.vi v10, v10, 4 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vsll.vi v10, v10, 2 +; CHECK-NEXT: vor.vv v8, v8, v10 
+; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v10, v10, v10 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret %v = call <16 x i16> @llvm.vp.bitreverse.v16i16(<16 x i16> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x i16> %v @@ -498,12 +498,12 @@ define <2 x i32> @vp_bitreverse_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %e ; CHECK-LABEL: vp_bitreverse_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t -; CHECK-NEXT: vor.vv v9, v9, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v9, v10, v9, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 @@ -511,25 +511,25 @@ define <2 x i32> @vp_bitreverse_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <2 x i32> @llvm.vp.bitreverse.v2i32(<2 x i32> %va, <2 x i1> %m, i32 %evl) ret <2 x i32> %v @@ -539,12 +539,12 @@ define <2 x i32> @vp_bitreverse_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) ; CHECK-LABEL: vp_bitreverse_v2i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsrl.vi v9, v8, 24 +; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vor.vv v9, v9, v10 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vor.vv v9, v10, v9 ; CHECK-NEXT: vsll.vi v10, v8, 24 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 @@ -552,25 +552,25 @@ define <2 x i32> @vp_bitreverse_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 
; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <2 x i32> @llvm.vp.bitreverse.v2i32(<2 x i32> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x i32> %v @@ -582,12 +582,12 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e ; CHECK-LABEL: vp_bitreverse_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t -; CHECK-NEXT: vor.vv v9, v9, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v9, v10, v9, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 @@ -595,25 +595,25 @@ define <4 x i32> @vp_bitreverse_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 4, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 2, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t -; CHECK-NEXT: vsrl.vi v9, v8, 1, v0.t -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t +; CHECK-NEXT: vand.vx v9, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v9, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret %v = call <4 x i32> @llvm.vp.bitreverse.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl) ret <4 x i32> %v @@ -623,12 +623,12 @@ define <4 x i32> @vp_bitreverse_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) ; CHECK-LABEL: vp_bitreverse_v4i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsrl.vi v9, v8, 24 +; CHECK-NEXT: 
vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vor.vv v9, v9, v10 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vor.vv v9, v10, v9 ; CHECK-NEXT: vsll.vi v10, v8, 24 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 @@ -636,25 +636,25 @@ define <4 x i32> @vp_bitreverse_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %v = call <4 x i32> @llvm.vp.bitreverse.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x i32> %v @@ -666,12 +666,12 @@ define <8 x i32> @vp_bitreverse_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %e ; CHECK-LABEL: vp_bitreverse_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 24, v0.t -; CHECK-NEXT: vor.vv v10, v10, v12, v0.t +; CHECK-NEXT: vand.vx v12, v12, a0, v0.t +; CHECK-NEXT: vor.vv v10, v12, v10, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 @@ -679,25 +679,25 @@ define <8 x i32> @vp_bitreverse_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %e ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 4, v0.t -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v10, v10, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v10, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 2, v0.t -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: vand.vx v10, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v10, v10, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v10, v8, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 1, v0.t -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t +; CHECK-NEXT: vand.vx 
v10, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v10, v10, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v10, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: ret %v = call <8 x i32> @llvm.vp.bitreverse.v8i32(<8 x i32> %va, <8 x i1> %m, i32 %evl) ret <8 x i32> %v @@ -707,12 +707,12 @@ define <8 x i32> @vp_bitreverse_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) ; CHECK-LABEL: vp_bitreverse_v8i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsrl.vi v10, v8, 8 +; CHECK-NEXT: vsrl.vi v10, v8, 24 +; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vor.vv v10, v10, v12 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vor.vv v10, v12, v10 ; CHECK-NEXT: vsll.vi v12, v8, 24 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 @@ -720,25 +720,25 @@ define <8 x i32> @vp_bitreverse_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v12, v8 ; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 2 +; CHECK-NEXT: vsll.vi v10, v10, 4 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v10, v10, a0 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vsll.vi v10, v10, 2 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v10, v10, v10 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret %v = call <8 x i32> @llvm.vp.bitreverse.v8i32(<8 x i32> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x i32> %v @@ -750,12 +750,12 @@ define <16 x i32> @vp_bitreverse_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroex ; CHECK-LABEL: vp_bitreverse_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsrl.vi v12, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v12, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t -; CHECK-NEXT: vor.vv v12, v12, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t +; CHECK-NEXT: vor.vv v12, v16, v12, v0.t ; CHECK-NEXT: vand.vx v16, v8, a0, v0.t ; CHECK-NEXT: lui a0, 61681 ; CHECK-NEXT: addi a0, a0, -241 @@ -763,25 +763,25 @@ define <16 x i32> @vp_bitreverse_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroex ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 4, v0.t -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t +; CHECK-NEXT: vand.vx v12, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v12, v12, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t 
; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v12, v8, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 2, v0.t -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: vand.vx v12, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v12, v12, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v12, v8, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 1, v0.t -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t +; CHECK-NEXT: vand.vx v12, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v12, v12, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v12, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret %v = call <16 x i32> @llvm.vp.bitreverse.v16i32(<16 x i32> %va, <16 x i1> %m, i32 %evl) ret <16 x i32> %v @@ -791,12 +791,12 @@ define <16 x i32> @vp_bitreverse_v16i32_unmasked(<16 x i32> %va, i32 zeroext %ev ; CHECK-LABEL: vp_bitreverse_v16i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsrl.vi v12, v8, 8 +; CHECK-NEXT: vsrl.vi v12, v8, 24 +; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vor.vv v12, v12, v16 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vor.vv v12, v16, v12 ; CHECK-NEXT: vsll.vi v16, v8, 24 ; CHECK-NEXT: vand.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 61681 @@ -804,25 +804,25 @@ define <16 x i32> @vp_bitreverse_v16i32_unmasked(<16 x i32> %va, i32 zeroext %ev ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v16, v8 ; CHECK-NEXT: vor.vv v8, v8, v12 -; CHECK-NEXT: vsrl.vi v12, v8, 4 +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: lui a0, 209715 ; CHECK-NEXT: addi a0, a0, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v12, v8 -; CHECK-NEXT: vsrl.vi v12, v8, 2 +; CHECK-NEXT: vsll.vi v12, v12, 4 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v12, v12, a0 ; CHECK-NEXT: lui a0, 349525 ; CHECK-NEXT: addi a0, a0, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v12, v8 -; CHECK-NEXT: vsrl.vi v12, v8, 1 +; CHECK-NEXT: vsll.vi v12, v12, 2 +; CHECK-NEXT: vor.vv v8, v8, v12 +; CHECK-NEXT: vand.vx v12, v8, a0 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v12, v12, v12 ; CHECK-NEXT: vand.vx v8, v8, a0 -; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v12, v8 +; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: ret %v = call <16 x i32> @llvm.vp.bitreverse.v16i32(<16 x i32> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x i32> %v @@ -883,21 +883,21 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV32-NEXT: vmv.v.x v11, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV32-NEXT: vand.vv v10, v10, v9, v0.t +; RV32-NEXT: vand.vv v10, v8, v9, v0.t +; RV32-NEXT: vsll.vi v10, v10, 4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t ; RV32-NEXT: vand.vv 
v8, v8, v9, v0.t -; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV32-NEXT: vand.vv v9, v9, v12, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v9, v8, v12, v0.t +; RV32-NEXT: vsll.vi v9, v9, 2, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v8, v8, v12, v0.t -; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v9, v8, v0.t -; RV32-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV32-NEXT: vand.vv v9, v9, v11, v0.t +; RV32-NEXT: vor.vv v8, v8, v9, v0.t +; RV32-NEXT: vand.vv v9, v8, v11, v0.t +; RV32-NEXT: vsll.vi v9, v9, 1, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: vand.vv v8, v8, v11, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v9, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v9, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -913,14 +913,14 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV64-NEXT: lui a7, 349525 ; RV64-NEXT: addi a5, a5, -241 ; RV64-NEXT: addi a6, a6, 819 -; RV64-NEXT: addi a7, a7, 1365 -; RV64-NEXT: slli t0, a5, 32 -; RV64-NEXT: add t0, a5, t0 +; RV64-NEXT: addi t0, a7, 1365 +; RV64-NEXT: slli a7, a5, 32 +; RV64-NEXT: add a7, a5, a7 ; RV64-NEXT: slli a5, a6, 32 ; RV64-NEXT: add a6, a6, a5 -; RV64-NEXT: slli a5, a7, 32 -; RV64-NEXT: add a5, a7, a5 -; RV64-NEXT: li a7, 40 +; RV64-NEXT: slli a5, t0, 32 +; RV64-NEXT: add a5, t0, a5 +; RV64-NEXT: li t0, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV64-NEXT: vand.vx v9, v8, a1, v0.t ; RV64-NEXT: slli a3, a3, 24 @@ -931,11 +931,11 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV64-NEXT: vor.vv v9, v9, v10, v0.t ; RV64-NEXT: vsll.vx v10, v8, a2, v0.t ; RV64-NEXT: vand.vx v11, v8, a0, v0.t -; RV64-NEXT: vsll.vx v11, v11, a7, v0.t +; RV64-NEXT: vsll.vx v11, v11, t0, v0.t ; RV64-NEXT: vor.vv v10, v10, v11, v0.t ; RV64-NEXT: vor.vv v9, v10, v9, v0.t ; RV64-NEXT: vsrl.vx v10, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v11, v8, a7, v0.t +; RV64-NEXT: vsrl.vx v11, v8, t0, v0.t ; RV64-NEXT: vand.vx v11, v11, a0, v0.t ; RV64-NEXT: vor.vv v10, v11, v10, v0.t ; RV64-NEXT: vsrl.vi v11, v8, 24, v0.t @@ -945,21 +945,21 @@ define <2 x i64> @vp_bitreverse_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %e ; RV64-NEXT: vor.vv v8, v8, v11, v0.t ; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: vor.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 4, v0.t -; RV64-NEXT: vand.vx v9, v9, t0, v0.t -; RV64-NEXT: vand.vx v8, v8, t0, v0.t -; RV64-NEXT: vsll.vi v8, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 2, v0.t -; RV64-NEXT: vand.vx v9, v9, a6, v0.t +; RV64-NEXT: vand.vx v9, v8, a7, v0.t +; RV64-NEXT: vsll.vi v9, v9, 4, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 4, v0.t +; RV64-NEXT: vand.vx v8, v8, a7, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a6, v0.t +; RV64-NEXT: vsll.vi v9, v9, 2, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV64-NEXT: vand.vx v8, v8, a6, v0.t -; RV64-NEXT: vsll.vi v8, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v9, v8, v0.t -; RV64-NEXT: vsrl.vi v9, v8, 1, v0.t -; RV64-NEXT: vand.vx v9, v9, a5, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t +; RV64-NEXT: vand.vx v9, v8, a5, v0.t +; RV64-NEXT: vsll.vi v9, v9, 1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV64-NEXT: vand.vx v8, v8, a5, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v9, v8, v0.t +; RV64-NEXT: vor.vv v8, v8, v9, v0.t ; RV64-NEXT: ret %v = call <2 
x i64> @llvm.vp.bitreverse.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl) ret <2 x i64> %v @@ -1019,21 +1019,21 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: vmv.v.x v10, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v9 -; RV32-NEXT: vsrl.vi v9, v8, 4 +; RV32-NEXT: vand.vv v9, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 4 +; RV32-NEXT: vsll.vi v9, v9, 4 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v9, v9, v12 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v11 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vsll.vi v9, v9, 2 ; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vand.vv v9, v9, v11 -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v9, v8 -; RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v9 +; RV32-NEXT: vand.vv v9, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: vadd.vv v9, v9, v9 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vand.vv v9, v9, v10 -; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -1081,21 +1081,21 @@ define <2 x i64> @vp_bitreverse_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) ; RV64-NEXT: vor.vv v9, v9, v11 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vand.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v9, v9, 4 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vand.vx v9, v9, a0 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v9, v9, 2 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v9, v9, a1 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v9, v9, v9 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v9, v9, a2 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: ret %v = call <2 x i64> @llvm.vp.bitreverse.v2i64(<2 x i64> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x i64> %v @@ -1156,21 +1156,21 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV32-NEXT: vmv.v.x v14, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV32-NEXT: vand.vv v12, v12, v10, v0.t +; RV32-NEXT: vand.vv v12, v8, v10, v0.t +; RV32-NEXT: vsll.vi v12, v12, 4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t ; RV32-NEXT: vand.vv v8, v8, v10, v0.t -; RV32-NEXT: vsll.vi v8, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v12, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV32-NEXT: vand.vv v10, v10, v16, v0.t +; RV32-NEXT: vor.vv v8, v8, v12, v0.t +; RV32-NEXT: vand.vv v10, v8, v16, v0.t +; RV32-NEXT: vsll.vi v10, v10, 2, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: vsll.vi v8, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t -; RV32-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV32-NEXT: vand.vv v10, v10, v14, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t +; RV32-NEXT: vand.vv v10, v8, v14, v0.t +; RV32-NEXT: vsll.vi v10, v10, 1, v0.t 
+; RV32-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV32-NEXT: vand.vv v8, v8, v14, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v10, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v10, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -1186,14 +1186,14 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV64-NEXT: lui a7, 349525 ; RV64-NEXT: addi a5, a5, -241 ; RV64-NEXT: addi a6, a6, 819 -; RV64-NEXT: addi a7, a7, 1365 -; RV64-NEXT: slli t0, a5, 32 -; RV64-NEXT: add t0, a5, t0 +; RV64-NEXT: addi t0, a7, 1365 +; RV64-NEXT: slli a7, a5, 32 +; RV64-NEXT: add a7, a5, a7 ; RV64-NEXT: slli a5, a6, 32 ; RV64-NEXT: add a6, a6, a5 -; RV64-NEXT: slli a5, a7, 32 -; RV64-NEXT: add a5, a7, a5 -; RV64-NEXT: li a7, 40 +; RV64-NEXT: slli a5, t0, 32 +; RV64-NEXT: add a5, t0, a5 +; RV64-NEXT: li t0, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV64-NEXT: vand.vx v10, v8, a1, v0.t ; RV64-NEXT: slli a3, a3, 24 @@ -1204,11 +1204,11 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV64-NEXT: vor.vv v10, v10, v12, v0.t ; RV64-NEXT: vsll.vx v12, v8, a2, v0.t ; RV64-NEXT: vand.vx v14, v8, a0, v0.t -; RV64-NEXT: vsll.vx v14, v14, a7, v0.t +; RV64-NEXT: vsll.vx v14, v14, t0, v0.t ; RV64-NEXT: vor.vv v12, v12, v14, v0.t ; RV64-NEXT: vor.vv v10, v12, v10, v0.t ; RV64-NEXT: vsrl.vx v12, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v14, v8, a7, v0.t +; RV64-NEXT: vsrl.vx v14, v8, t0, v0.t ; RV64-NEXT: vand.vx v14, v14, a0, v0.t ; RV64-NEXT: vor.vv v12, v14, v12, v0.t ; RV64-NEXT: vsrl.vi v14, v8, 24, v0.t @@ -1218,21 +1218,21 @@ define <4 x i64> @vp_bitreverse_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %e ; RV64-NEXT: vor.vv v8, v8, v14, v0.t ; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: vor.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 4, v0.t -; RV64-NEXT: vand.vx v10, v10, t0, v0.t -; RV64-NEXT: vand.vx v8, v8, t0, v0.t -; RV64-NEXT: vsll.vi v8, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 2, v0.t -; RV64-NEXT: vand.vx v10, v10, a6, v0.t +; RV64-NEXT: vand.vx v10, v8, a7, v0.t +; RV64-NEXT: vsll.vi v10, v10, 4, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 4, v0.t +; RV64-NEXT: vand.vx v8, v8, a7, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a6, v0.t +; RV64-NEXT: vsll.vi v10, v10, 2, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV64-NEXT: vand.vx v8, v8, a6, v0.t -; RV64-NEXT: vsll.vi v8, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v10, v8, v0.t -; RV64-NEXT: vsrl.vi v10, v8, 1, v0.t -; RV64-NEXT: vand.vx v10, v10, a5, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t +; RV64-NEXT: vand.vx v10, v8, a5, v0.t +; RV64-NEXT: vsll.vi v10, v10, 1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV64-NEXT: vand.vx v8, v8, a5, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v10, v8, v0.t +; RV64-NEXT: vor.vv v8, v8, v10, v0.t ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.bitreverse.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl) ret <4 x i64> %v @@ -1292,21 +1292,21 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: vmv.v.x v12, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vand.vv v10, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 4 +; RV32-NEXT: vsll.vi v10, v10, 4 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vand.vv v10, v10, v16 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 
2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v14 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vsll.vi v10, v10, 2 ; RV32-NEXT: vand.vv v8, v8, v14 -; RV32-NEXT: vand.vv v10, v10, v14 -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -1354,21 +1354,21 @@ define <4 x i64> @vp_bitreverse_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) ; RV64-NEXT: vor.vv v10, v12, v10 ; RV64-NEXT: vor.vv v8, v8, v14 ; RV64-NEXT: vor.vv v8, v8, v10 -; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vand.vx v10, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v10, v10, 4 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vand.vx v10, v10, a0 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v10, v10, 2 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v10, v10, a1 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v10, v10, v10 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v10, v10, a2 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: ret %v = call <4 x i64> @llvm.vp.bitreverse.v4i64(<4 x i64> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x i64> %v @@ -1429,21 +1429,21 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vor.vv v16, v16, v20, v0.t -; RV32-NEXT: vsrl.vi v20, v16, 4, v0.t -; RV32-NEXT: vand.vv v20, v20, v28, v0.t +; RV32-NEXT: vand.vv v20, v16, v28, v0.t +; RV32-NEXT: vsll.vi v20, v20, 4, v0.t +; RV32-NEXT: vsrl.vi v16, v16, 4, v0.t ; RV32-NEXT: vand.vv v16, v16, v28, v0.t -; RV32-NEXT: vsll.vi v16, v16, 4, v0.t -; RV32-NEXT: vor.vv v16, v20, v16, v0.t -; RV32-NEXT: vsrl.vi v20, v16, 2, v0.t -; RV32-NEXT: vand.vv v20, v20, v12, v0.t +; RV32-NEXT: vor.vv v16, v16, v20, v0.t +; RV32-NEXT: vand.vv v20, v16, v12, v0.t +; RV32-NEXT: vsll.vi v20, v20, 2, v0.t +; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t ; RV32-NEXT: vand.vv v12, v16, v12, v0.t -; RV32-NEXT: vsll.vi v12, v12, 2, v0.t -; RV32-NEXT: vor.vv v12, v20, v12, v0.t -; RV32-NEXT: vsrl.vi v16, v12, 1, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vor.vv v12, v12, v20, v0.t +; RV32-NEXT: vand.vv v16, v12, v8, v0.t +; RV32-NEXT: vsll.vi v16, v16, 1, v0.t +; RV32-NEXT: vsrl.vi v12, v12, 1, v0.t ; RV32-NEXT: vand.vv v8, v12, v8, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -1459,14 +1459,14 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV64-NEXT: lui a7, 349525 ; RV64-NEXT: addi a5, a5, -241 ; RV64-NEXT: addi a6, a6, 819 -; RV64-NEXT: addi a7, a7, 1365 -; RV64-NEXT: slli 
t0, a5, 32 -; RV64-NEXT: add t0, a5, t0 +; RV64-NEXT: addi t0, a7, 1365 +; RV64-NEXT: slli a7, a5, 32 +; RV64-NEXT: add a7, a5, a7 ; RV64-NEXT: slli a5, a6, 32 ; RV64-NEXT: add a6, a6, a5 -; RV64-NEXT: slli a5, a7, 32 -; RV64-NEXT: add a5, a7, a5 -; RV64-NEXT: li a7, 40 +; RV64-NEXT: slli a5, t0, 32 +; RV64-NEXT: add a5, t0, a5 +; RV64-NEXT: li t0, 40 ; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV64-NEXT: vand.vx v12, v8, a1, v0.t ; RV64-NEXT: slli a3, a3, 24 @@ -1477,11 +1477,11 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV64-NEXT: vor.vv v12, v12, v16, v0.t ; RV64-NEXT: vsll.vx v16, v8, a2, v0.t ; RV64-NEXT: vand.vx v20, v8, a0, v0.t -; RV64-NEXT: vsll.vx v20, v20, a7, v0.t +; RV64-NEXT: vsll.vx v20, v20, t0, v0.t ; RV64-NEXT: vor.vv v16, v16, v20, v0.t ; RV64-NEXT: vor.vv v12, v16, v12, v0.t ; RV64-NEXT: vsrl.vx v16, v8, a2, v0.t -; RV64-NEXT: vsrl.vx v20, v8, a7, v0.t +; RV64-NEXT: vsrl.vx v20, v8, t0, v0.t ; RV64-NEXT: vand.vx v20, v20, a0, v0.t ; RV64-NEXT: vor.vv v16, v20, v16, v0.t ; RV64-NEXT: vsrl.vi v20, v8, 24, v0.t @@ -1491,21 +1491,21 @@ define <8 x i64> @vp_bitreverse_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %e ; RV64-NEXT: vor.vv v8, v8, v20, v0.t ; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: vor.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 4, v0.t -; RV64-NEXT: vand.vx v12, v12, t0, v0.t -; RV64-NEXT: vand.vx v8, v8, t0, v0.t -; RV64-NEXT: vsll.vi v8, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 2, v0.t -; RV64-NEXT: vand.vx v12, v12, a6, v0.t +; RV64-NEXT: vand.vx v12, v8, a7, v0.t +; RV64-NEXT: vsll.vi v12, v12, 4, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 4, v0.t +; RV64-NEXT: vand.vx v8, v8, a7, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a6, v0.t +; RV64-NEXT: vsll.vi v12, v12, 2, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV64-NEXT: vand.vx v8, v8, a6, v0.t -; RV64-NEXT: vsll.vi v8, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v12, v8, v0.t -; RV64-NEXT: vsrl.vi v12, v8, 1, v0.t -; RV64-NEXT: vand.vx v12, v12, a5, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t +; RV64-NEXT: vand.vx v12, v8, a5, v0.t +; RV64-NEXT: vsll.vi v12, v12, 1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV64-NEXT: vand.vx v8, v8, a5, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v12, v8, v0.t +; RV64-NEXT: vor.vv v8, v8, v12, v0.t ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.bitreverse.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl) ret <8 x i64> %v @@ -1565,21 +1565,21 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; RV32-NEXT: vor.vv v8, v8, v12 -; RV32-NEXT: vsrl.vi v12, v8, 4 +; RV32-NEXT: vand.vv v12, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 4 +; RV32-NEXT: vsll.vi v12, v12, 4 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v12, v12, v24 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v20 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vsll.vi v12, v12, 2 ; RV32-NEXT: vand.vv v8, v8, v20 -; RV32-NEXT: vand.vv v12, v12, v20 -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v12, v8 -; RV32-NEXT: vsrl.vi v12, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vand.vv v12, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: vadd.vv v12, v12, v12 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vand.vv v12, v12, v16 -; 
RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: vor.vv v8, v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret @@ -1627,21 +1627,21 @@ define <8 x i64> @vp_bitreverse_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) ; RV64-NEXT: vor.vv v12, v16, v12 ; RV64-NEXT: vor.vv v8, v8, v20 ; RV64-NEXT: vor.vv v8, v8, v12 -; RV64-NEXT: vsrl.vi v12, v8, 4 +; RV64-NEXT: vand.vx v12, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v12, v12, 4 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vand.vx v12, v12, a0 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vand.vx v12, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v12, v12, 2 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v12, v12, a1 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v12, v8 -; RV64-NEXT: vsrl.vi v12, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v12 +; RV64-NEXT: vand.vx v12, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v12, v12, v12 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v12, v12, a2 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v12, v8 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: ret %v = call <8 x i64> @llvm.vp.bitreverse.v8i64(<8 x i64> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x i64> %v @@ -1733,35 +1733,35 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroex ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: addi a3, a3, 1365 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vsll.vi v24, v24, 4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v24, v24, 4, v0.t ; RV32-NEXT: vor.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsll.vi v16, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t ; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v24, v24, 2, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV32-NEXT: vand.vv v24, v24, v8, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: vsll.vi v24, v24, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t ; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 @@ -1826,21 +1826,21 @@ define <15 x i64> @vp_bitreverse_v15i64(<15 x i64> %va, <15 x i1> %m, i32 
zeroex ; RV64-NEXT: addi a3, sp, 16 ; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v16, 4, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 4, v0.t ; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vsll.vi v8, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v16, 2, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vsll.vi v8, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a2, v0.t +; RV64-NEXT: vsll.vi v16, v16, 1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV64-NEXT: vand.vx v8, v8, a2, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 @@ -1918,27 +1918,27 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 4 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vsll.vi v16, v16, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vsll.vi v16, v16, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 1 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vadd.vv v16, v16, v16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -2000,21 +2000,21 @@ define <15 x i64> @vp_bitreverse_v15i64_unmasked(<15 x i64> %va, i32 zeroext %ev ; RV64-NEXT: vor.vv v16, v24, v16 ; RV64-NEXT: vor.vv v8, v8, v0 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v16, v16, 4 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v16, v16, 2 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v16, v16, a1 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, 
v16 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v16, v16, v16 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v16, v16, a2 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 @@ -2112,35 +2112,35 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: addi a3, a3, 1365 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a1 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vor.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vand.vv v24, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vand.vv v24, v8, v16, v0.t +; RV32-NEXT: vsll.vi v24, v24, 4, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 4, v0.t +; RV32-NEXT: vand.vv v16, v8, v16, v0.t ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v24, v24, 4, v0.t ; RV32-NEXT: vor.vv v24, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v16, v24, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v8, v0.t +; RV32-NEXT: vand.vv v16, v24, v8, v0.t +; RV32-NEXT: vsll.vi v16, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v24, v24, 2, v0.t ; RV32-NEXT: vand.vv v24, v24, v8, v0.t ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v24, v24, 2, v0.t -; RV32-NEXT: vor.vv v16, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t -; RV32-NEXT: vand.vv v24, v24, v8, v0.t +; RV32-NEXT: vor.vv v16, v24, v16, v0.t +; RV32-NEXT: vand.vv v24, v16, v8, v0.t +; RV32-NEXT: vsll.vi v24, v24, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v16, 1, v0.t ; RV32-NEXT: vand.vv v8, v16, v8, v0.t -; RV32-NEXT: vsll.vi v8, v8, 1, v0.t -; RV32-NEXT: vor.vv v8, v24, v8, v0.t +; RV32-NEXT: vor.vv v8, v8, v24, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 @@ -2205,21 +2205,21 @@ define <16 x i64> @vp_bitreverse_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroex ; RV64-NEXT: addi a3, sp, 16 ; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV64-NEXT: vand.vx v16, v16, a0, v0.t +; RV64-NEXT: vand.vx v16, v8, a0, v0.t +; RV64-NEXT: vsll.vi v16, v16, 4, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 4, v0.t ; RV64-NEXT: vand.vx v8, v8, a0, v0.t -; RV64-NEXT: vsll.vi v8, v8, 4, v0.t -; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 2, v0.t -; RV64-NEXT: vand.vx v16, v16, a1, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a1, v0.t +; RV64-NEXT: vsll.vi v16, v16, 2, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 2, v0.t ; RV64-NEXT: vand.vx v8, v8, a1, v0.t -; RV64-NEXT: vsll.vi v8, v8, 2, v0.t -; RV64-NEXT: vor.vv v8, v16, v8, v0.t -; RV64-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV64-NEXT: vand.vx v16, v16, a2, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t +; RV64-NEXT: vand.vx v16, v8, a2, v0.t +; RV64-NEXT: vsll.vi v16, v16, 1, v0.t +; RV64-NEXT: vsrl.vi v8, v8, 1, v0.t ; RV64-NEXT: 
vand.vx v8, v8, a2, v0.t -; RV64-NEXT: vsll.vi v8, v8, 1, v0.t -; RV64-NEXT: vor.vv v8, v16, v8, v0.t +; RV64-NEXT: vor.vv v8, v8, v16, v0.t ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 @@ -2297,27 +2297,27 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV32-NEXT: vmv.v.x v24, a1 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 4 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 4 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 2 +; RV32-NEXT: vsll.vi v16, v16, 4 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v16, v8 -; RV32-NEXT: vsrl.vi v16, v8, 1 +; RV32-NEXT: vsll.vi v16, v16, 2 +; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vand.vv v16, v8, v24 +; RV32-NEXT: vsrl.vi v8, v8, 1 ; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v16, v8 +; RV32-NEXT: vadd.vv v16, v16, v16 +; RV32-NEXT: vor.vv v8, v8, v16 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -2379,21 +2379,21 @@ define <16 x i64> @vp_bitreverse_v16i64_unmasked(<16 x i64> %va, i32 zeroext %ev ; RV64-NEXT: vor.vv v16, v24, v16 ; RV64-NEXT: vor.vv v8, v8, v0 ; RV64-NEXT: vor.vv v8, v8, v16 -; RV64-NEXT: vsrl.vi v16, v8, 4 +; RV64-NEXT: vand.vx v16, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v16, v16, 4 ; RV64-NEXT: vand.vx v8, v8, a0 -; RV64-NEXT: vand.vx v16, v16, a0 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v16, v16, 2 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v16, v16, a1 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v16, v8 -; RV64-NEXT: vsrl.vi v16, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v16 +; RV64-NEXT: vand.vx v16, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v16, v16, v16 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v16, v16, a2 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v16, v8 +; RV64-NEXT: vor.vv v8, v8, v16 ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 3 ; RV64-NEXT: add sp, sp, a0 @@ -2428,45 +2428,45 @@ define <128 x i16> @vp_bitreverse_v128i16(<128 x i16> %va, <128 x i1> %m, i32 ze ; CHECK-NEXT: and a3, a0, a3 ; CHECK-NEXT: lui a0, 5 ; CHECK-NEXT: vsll.vi v8, v8, 8, v0.t -; CHECK-NEXT: addi a1, a1, -241 -; CHECK-NEXT: addi a2, a2, 819 +; CHECK-NEXT: addi a4, a1, -241 +; CHECK-NEXT: addi a1, a2, 819 ; CHECK-NEXT: addi a0, a0, 1365 ; CHECK-NEXT: vor.vv v8, v8, v24, v0.t -; CHECK-NEXT: vsrl.vi v24, v8, 4, v0.t -; CHECK-NEXT: vand.vx v24, v24, a1, v0.t +; CHECK-NEXT: vand.vx v24, v8, a4, v0.t +; CHECK-NEXT: vsll.vi v24, v24, 4, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 4, v0.t +; CHECK-NEXT: vand.vx v8, v8, a4, v0.t +; CHECK-NEXT: vor.vv v8, v8, v24, v0.t +; CHECK-NEXT: 
vand.vx v24, v8, a1, v0.t +; CHECK-NEXT: vsll.vi v24, v24, 2, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 2, v0.t ; CHECK-NEXT: vand.vx v8, v8, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 4, v0.t -; CHECK-NEXT: vor.vv v8, v24, v8, v0.t -; CHECK-NEXT: vsrl.vi v24, v8, 2, v0.t -; CHECK-NEXT: vand.vx v24, v24, a2, v0.t -; CHECK-NEXT: vand.vx v8, v8, a2, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 2, v0.t -; CHECK-NEXT: vor.vv v8, v24, v8, v0.t -; CHECK-NEXT: vsrl.vi v24, v8, 1, v0.t -; CHECK-NEXT: vand.vx v24, v24, a0, v0.t +; CHECK-NEXT: vor.vv v8, v8, v24, v0.t +; CHECK-NEXT: vand.vx v24, v8, a0, v0.t +; CHECK-NEXT: vsll.vi v24, v24, 1, v0.t +; CHECK-NEXT: vsrl.vi v8, v8, 1, v0.t ; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t -; CHECK-NEXT: vor.vv v8, v24, v8, v0.t +; CHECK-NEXT: vor.vv v8, v8, v24, v0.t ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vsetvli zero, a3, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v16, 8, v0.t ; CHECK-NEXT: vsll.vi v16, v16, 8, v0.t ; CHECK-NEXT: vor.vv v16, v16, v24, v0.t -; CHECK-NEXT: vsrl.vi v24, v16, 4, v0.t -; CHECK-NEXT: vand.vx v24, v24, a1, v0.t +; CHECK-NEXT: vand.vx v24, v16, a4, v0.t +; CHECK-NEXT: vsll.vi v24, v24, 4, v0.t +; CHECK-NEXT: vsrl.vi v16, v16, 4, v0.t +; CHECK-NEXT: vand.vx v16, v16, a4, v0.t +; CHECK-NEXT: vor.vv v16, v16, v24, v0.t +; CHECK-NEXT: vand.vx v24, v16, a1, v0.t +; CHECK-NEXT: vsll.vi v24, v24, 2, v0.t +; CHECK-NEXT: vsrl.vi v16, v16, 2, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vsll.vi v16, v16, 4, v0.t -; CHECK-NEXT: vor.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsrl.vi v24, v16, 2, v0.t -; CHECK-NEXT: vand.vx v24, v24, a2, v0.t -; CHECK-NEXT: vand.vx v16, v16, a2, v0.t -; CHECK-NEXT: vsll.vi v16, v16, 2, v0.t -; CHECK-NEXT: vor.vv v16, v24, v16, v0.t -; CHECK-NEXT: vsrl.vi v24, v16, 1, v0.t -; CHECK-NEXT: vand.vx v24, v24, a0, v0.t +; CHECK-NEXT: vor.vv v16, v16, v24, v0.t +; CHECK-NEXT: vand.vx v24, v16, a0, v0.t +; CHECK-NEXT: vsll.vi v24, v24, 1, v0.t +; CHECK-NEXT: vsrl.vi v16, v16, 1, v0.t ; CHECK-NEXT: vand.vx v16, v16, a0, v0.t -; CHECK-NEXT: vsll.vi v16, v16, 1, v0.t -; CHECK-NEXT: vor.vv v16, v24, v16, v0.t +; CHECK-NEXT: vor.vv v16, v16, v24, v0.t ; CHECK-NEXT: ret %v = call <128 x i16> @llvm.vp.bitreverse.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl) ret <128 x i16> %v @@ -2495,44 +2495,44 @@ define <128 x i16> @vp_bitreverse_v128i16_unmasked(<128 x i16> %va, i32 zeroext ; CHECK-NEXT: addi a2, a2, -241 ; CHECK-NEXT: addi a3, a3, 819 ; CHECK-NEXT: addi a4, a4, 1365 -; CHECK-NEXT: vsrl.vi v24, v8, 4 +; CHECK-NEXT: vand.vx v24, v8, a2 +; CHECK-NEXT: vsrl.vi v8, v8, 4 +; CHECK-NEXT: vsll.vi v24, v24, 4 ; CHECK-NEXT: vand.vx v8, v8, a2 -; CHECK-NEXT: vand.vx v24, v24, a2 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vor.vv v8, v8, v24 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vsrl.vi v24, v16, 8 ; CHECK-NEXT: vsll.vi v16, v16, 8 ; CHECK-NEXT: vor.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vsrl.vi v24, v8, 2 +; CHECK-NEXT: vand.vx v24, v8, a3 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vsll.vi v24, v24, 2 ; CHECK-NEXT: vand.vx v8, v8, a3 -; CHECK-NEXT: vand.vx v24, v24, a3 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vor.vv v8, v8, v24 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vsrl.vi v24, v16, 4 +; CHECK-NEXT: vand.vx v24, v16, a2 +; CHECK-NEXT: vsrl.vi v16, v16, 4 +; CHECK-NEXT: vsll.vi v24, v24, 4 ; CHECK-NEXT: vand.vx v16, v16, a2 -; 
CHECK-NEXT: vand.vx v24, v24, a2 -; CHECK-NEXT: vsll.vi v16, v16, 4 -; CHECK-NEXT: vor.vv v16, v24, v16 +; CHECK-NEXT: vor.vv v16, v16, v24 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vsrl.vi v24, v8, 1 +; CHECK-NEXT: vand.vx v24, v8, a4 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v24, v24, v24 ; CHECK-NEXT: vand.vx v8, v8, a4 -; CHECK-NEXT: vand.vx v24, v24, a4 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v24, v8 +; CHECK-NEXT: vor.vv v8, v8, v24 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vsrl.vi v24, v16, 2 +; CHECK-NEXT: vand.vx v24, v16, a3 +; CHECK-NEXT: vsrl.vi v16, v16, 2 +; CHECK-NEXT: vsll.vi v24, v24, 2 ; CHECK-NEXT: vand.vx v16, v16, a3 -; CHECK-NEXT: vand.vx v24, v24, a3 -; CHECK-NEXT: vsll.vi v16, v16, 2 -; CHECK-NEXT: vor.vv v16, v24, v16 -; CHECK-NEXT: vsrl.vi v24, v16, 1 +; CHECK-NEXT: vor.vv v16, v16, v24 +; CHECK-NEXT: vand.vx v24, v16, a4 +; CHECK-NEXT: vsrl.vi v16, v16, 1 +; CHECK-NEXT: vadd.vv v24, v24, v24 ; CHECK-NEXT: vand.vx v16, v16, a4 -; CHECK-NEXT: vand.vx v24, v24, a4 -; CHECK-NEXT: vadd.vv v16, v16, v16 -; CHECK-NEXT: vor.vv v16, v24, v16 +; CHECK-NEXT: vor.vv v16, v16, v24 ; CHECK-NEXT: ret %v = call <128 x i16> @llvm.vp.bitreverse.v128i16(<128 x i16> %va, <128 x i1> splat (i1 true), i32 %evl) ret <128 x i16> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll index 6d9793c12153e..050724110b7fd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll @@ -14,25 +14,25 @@ define void @bitreverse_v8i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsrl.vi v9, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: lui a1, 3 ; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: lui a1, 5 ; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret ; @@ -69,25 +69,25 @@ define void @bitreverse_v4i32(ptr %x, ptr %y) { ; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vor.vv v8, v8, v9 -; CHECK-NEXT: vsrl.vi v9, v8, 4 +; CHECK-NEXT: vand.vx v9, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: lui a1, 209715 ; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 2 +; CHECK-NEXT: vsll.vi v9, v9, 4 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a1 ; CHECK-NEXT: 
lui a1, 349525 ; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v9, v8 -; CHECK-NEXT: vsrl.vi v9, v8, 1 +; CHECK-NEXT: vsll.vi v9, v9, 2 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vand.vx v9, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v9, v9, v9 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v9, v9, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret ; @@ -113,67 +113,67 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) { ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a2, 1044480 -; RV32-NEXT: li a3, 56 -; RV32-NEXT: li a4, 40 -; RV32-NEXT: lui a5, 16 -; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: lui a4, 16 +; RV32-NEXT: lui a5, 4080 ; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: addi a2, a5, -256 +; RV32-NEXT: addi a1, a4, -256 ; RV32-NEXT: vlse64.v v9, (a6), zero -; RV32-NEXT: vsrl.vx v10, v8, a3 -; RV32-NEXT: vsrl.vx v11, v8, a4 +; RV32-NEXT: vsrl.vx v10, v8, a2 +; RV32-NEXT: vsrl.vx v11, v8, a3 ; RV32-NEXT: vsrl.vi v12, v8, 24 -; RV32-NEXT: vsll.vx v13, v8, a3 -; RV32-NEXT: vand.vx v11, v11, a2 +; RV32-NEXT: vsll.vx v13, v8, a2 +; RV32-NEXT: vand.vx v11, v11, a1 ; RV32-NEXT: vor.vv v10, v11, v10 -; RV32-NEXT: vand.vx v11, v8, a2 -; RV32-NEXT: vsll.vx v11, v11, a4 +; RV32-NEXT: vand.vx v11, v8, a1 +; RV32-NEXT: vsll.vx v11, v11, a3 ; RV32-NEXT: vor.vv v11, v13, v11 ; RV32-NEXT: vsrl.vi v13, v8, 8 -; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vand.vx v12, v12, a5 ; RV32-NEXT: vand.vv v13, v13, v9 ; RV32-NEXT: vor.vv v12, v13, v12 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 ; RV32-NEXT: vor.vv v10, v12, v10 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v12, a2 +; RV32-NEXT: vmv.v.x v12, a1 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vand.vv v9, v8, v9 -; RV32-NEXT: vand.vx v8, v8, a1 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v9, v9, 8 ; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v9, a3 +; RV32-NEXT: vmv.v.x v9, a2 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v11, v8 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v11, a4 +; RV32-NEXT: vmv.v.x v11, a3 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vor.vv v8, v8, v10 -; RV32-NEXT: vsrl.vi v10, v8, 4 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 4 +; RV32-NEXT: vsll.vi v10, v10, 4 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v10, v8, v9 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vsll.vi v10, v10, 2 ; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vand.vv v9, v10, v9 -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v9, v8 -; 
RV32-NEXT: vsrl.vi v9, v8, 1 +; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vand.vv v9, v8, v11 +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: vadd.vv v9, v9, v9 ; RV32-NEXT: vand.vv v8, v8, v11 -; RV32-NEXT: vand.vv v9, v9, v11 -; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v9, v8 +; RV32-NEXT: vor.vv v8, v8, v9 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -223,21 +223,21 @@ define void @bitreverse_v2i64(ptr %x, ptr %y) { ; RV64-NEXT: vor.vv v9, v11, v9 ; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 -; RV64-NEXT: vsrl.vi v9, v8, 4 +; RV64-NEXT: vand.vx v9, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v9, v9, 4 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v9, v9, a1 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v9, v9, 2 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v9, v9, a2 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v9, v8 -; RV64-NEXT: vsrl.vi v9, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vand.vx v9, v8, a3 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v9, v9, v9 ; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vand.vx v9, v9, a3 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret ; @@ -266,25 +266,25 @@ define void @bitreverse_v16i16(ptr %x, ptr %y) { ; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: vsll.vi v8, v8, 8 ; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: lui a1, 3 ; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 2 +; CHECK-NEXT: vsll.vi v10, v10, 4 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: lui a1, 5 ; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vsll.vi v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vsll.vi v10, v10, 2 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v10, v10, v10 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v10, v10, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret ; @@ -321,25 +321,25 @@ define void @bitreverse_v8i32(ptr %x, ptr %y) { ; CHECK-NEXT: vsll.vi v12, v12, 8 ; CHECK-NEXT: vor.vv v8, v8, v12 ; CHECK-NEXT: vor.vv v8, v8, v10 -; CHECK-NEXT: vsrl.vi v10, v8, 4 +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 4 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: lui a1, 209715 ; CHECK-NEXT: addi a1, a1, 819 -; CHECK-NEXT: vsll.vi v8, v8, 4 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 2 +; CHECK-NEXT: vsll.vi v10, v10, 4 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v10, v10, a1 ; CHECK-NEXT: lui a1, 349525 ; CHECK-NEXT: addi a1, a1, 1365 -; CHECK-NEXT: vsll.vi 
v8, v8, 2 -; CHECK-NEXT: vor.vv v8, v10, v8 -; CHECK-NEXT: vsrl.vi v10, v8, 1 +; CHECK-NEXT: vsll.vi v10, v10, 2 +; CHECK-NEXT: vor.vv v8, v8, v10 +; CHECK-NEXT: vand.vx v10, v8, a1 +; CHECK-NEXT: vsrl.vi v8, v8, 1 +; CHECK-NEXT: vadd.vv v10, v10, v10 ; CHECK-NEXT: vand.vx v8, v8, a1 -; CHECK-NEXT: vand.vx v10, v10, a1 -; CHECK-NEXT: vadd.vv v8, v8, v8 -; CHECK-NEXT: vor.vv v8, v10, v8 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret ; @@ -365,67 +365,67 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a2, 1044480 -; RV32-NEXT: li a3, 56 -; RV32-NEXT: li a4, 40 -; RV32-NEXT: lui a5, 16 -; RV32-NEXT: lui a1, 4080 +; RV32-NEXT: lui a1, 1044480 +; RV32-NEXT: li a2, 56 +; RV32-NEXT: li a3, 40 +; RV32-NEXT: lui a4, 16 +; RV32-NEXT: lui a5, 4080 ; RV32-NEXT: addi a6, sp, 8 -; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) -; RV32-NEXT: addi a2, a5, -256 +; RV32-NEXT: addi a1, a4, -256 ; RV32-NEXT: vlse64.v v10, (a6), zero -; RV32-NEXT: vsrl.vx v12, v8, a3 -; RV32-NEXT: vsrl.vx v14, v8, a4 +; RV32-NEXT: vsrl.vx v12, v8, a2 +; RV32-NEXT: vsrl.vx v14, v8, a3 ; RV32-NEXT: vsrl.vi v16, v8, 24 -; RV32-NEXT: vsll.vx v18, v8, a3 -; RV32-NEXT: vand.vx v14, v14, a2 +; RV32-NEXT: vsll.vx v18, v8, a2 +; RV32-NEXT: vand.vx v14, v14, a1 ; RV32-NEXT: vor.vv v14, v14, v12 -; RV32-NEXT: vand.vx v12, v8, a2 -; RV32-NEXT: vsll.vx v12, v12, a4 +; RV32-NEXT: vand.vx v12, v8, a1 +; RV32-NEXT: vsll.vx v12, v12, a3 ; RV32-NEXT: vor.vv v12, v18, v12 ; RV32-NEXT: vsrl.vi v18, v8, 8 -; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vand.vx v16, v16, a5 ; RV32-NEXT: vand.vv v18, v18, v10 ; RV32-NEXT: vor.vv v16, v18, v16 -; RV32-NEXT: lui a2, 61681 -; RV32-NEXT: lui a3, 209715 -; RV32-NEXT: lui a4, 349525 -; RV32-NEXT: addi a2, a2, -241 -; RV32-NEXT: addi a3, a3, 819 -; RV32-NEXT: addi a4, a4, 1365 +; RV32-NEXT: lui a1, 61681 +; RV32-NEXT: lui a2, 209715 +; RV32-NEXT: lui a3, 349525 +; RV32-NEXT: addi a1, a1, -241 +; RV32-NEXT: addi a2, a2, 819 +; RV32-NEXT: addi a3, a3, 1365 ; RV32-NEXT: vor.vv v14, v16, v14 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: vmv.v.x v16, a1 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vand.vv v10, v8, v10 -; RV32-NEXT: vand.vx v8, v8, a1 +; RV32-NEXT: vand.vx v8, v8, a5 ; RV32-NEXT: vsll.vi v8, v8, 24 ; RV32-NEXT: vsll.vi v10, v10, 8 ; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v10, a3 +; RV32-NEXT: vmv.v.x v10, a2 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a4 +; RV32-NEXT: vmv.v.x v12, a3 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vor.vv v8, v8, v14 -; RV32-NEXT: vsrl.vi v14, v8, 4 +; RV32-NEXT: vand.vv v14, v8, v16 +; RV32-NEXT: vsrl.vi v8, v8, 4 +; RV32-NEXT: vsll.vi v14, v14, 4 ; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vand.vv v14, v14, v16 -; RV32-NEXT: vsll.vi v8, v8, 4 -; RV32-NEXT: vor.vv v8, v14, v8 -; RV32-NEXT: vsrl.vi v14, v8, 2 +; RV32-NEXT: vor.vv v8, v8, v14 +; RV32-NEXT: vand.vv v14, v8, v10 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vsll.vi v14, v14, 2 ; RV32-NEXT: vand.vv v8, v8, v10 -; RV32-NEXT: vand.vv v10, v14, v10 -; RV32-NEXT: vsll.vi v8, v8, 2 -; RV32-NEXT: vor.vv v8, v10, v8 -; RV32-NEXT: vsrl.vi v10, v8, 1 +; RV32-NEXT: 
vor.vv v8, v8, v14 +; RV32-NEXT: vand.vv v10, v8, v12 +; RV32-NEXT: vsrl.vi v8, v8, 1 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vand.vv v8, v8, v12 -; RV32-NEXT: vand.vv v10, v10, v12 -; RV32-NEXT: vadd.vv v8, v8, v8 -; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: vor.vv v8, v8, v10 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -475,21 +475,21 @@ define void @bitreverse_v4i64(ptr %x, ptr %y) { ; RV64-NEXT: vor.vv v8, v10, v8 ; RV64-NEXT: vor.vv v10, v14, v12 ; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 4 +; RV64-NEXT: vand.vx v10, v8, a1 +; RV64-NEXT: vsrl.vi v8, v8, 4 +; RV64-NEXT: vsll.vi v10, v10, 4 ; RV64-NEXT: vand.vx v8, v8, a1 -; RV64-NEXT: vand.vx v10, v10, a1 -; RV64-NEXT: vsll.vi v8, v8, 4 -; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 2 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a2 +; RV64-NEXT: vsrl.vi v8, v8, 2 +; RV64-NEXT: vsll.vi v10, v10, 2 ; RV64-NEXT: vand.vx v8, v8, a2 -; RV64-NEXT: vand.vx v10, v10, a2 -; RV64-NEXT: vsll.vi v8, v8, 2 -; RV64-NEXT: vor.vv v8, v10, v8 -; RV64-NEXT: vsrl.vi v10, v8, 1 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vand.vx v10, v8, a3 +; RV64-NEXT: vsrl.vi v8, v8, 1 +; RV64-NEXT: vadd.vv v10, v10, v10 ; RV64-NEXT: vand.vx v8, v8, a3 -; RV64-NEXT: vand.vx v10, v10, a3 -; RV64-NEXT: vadd.vv v8, v8, v8 -; RV64-NEXT: vor.vv v8, v10, v8 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll index b7ca932bb1c45..d99c239de8f1d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap-vp.ll @@ -114,12 +114,12 @@ define <2 x i32> @vp_bswap_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t -; CHECK-NEXT: vor.vv v9, v9, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v9, v10, v9, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v10, v10, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t @@ -134,12 +134,12 @@ define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v2i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsrl.vi v9, v8, 24 +; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vor.vv v9, v9, v10 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vor.vv v9, v10, v9 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vsll.vi v8, v8, 24 @@ -156,12 +156,12 @@ define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v9, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0, v0.t -; CHECK-NEXT: 
vsrl.vi v10, v8, 24, v0.t -; CHECK-NEXT: vor.vv v9, v9, v10, v0.t +; CHECK-NEXT: vand.vx v10, v10, a0, v0.t +; CHECK-NEXT: vor.vv v9, v10, v9, v0.t ; CHECK-NEXT: vand.vx v10, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v10, v10, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t @@ -176,12 +176,12 @@ define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v4i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsrl.vi v9, v8, 24 +; CHECK-NEXT: vsrl.vi v10, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v10, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v9, v9, a0 -; CHECK-NEXT: vor.vv v9, v9, v10 +; CHECK-NEXT: vand.vx v10, v10, a0 +; CHECK-NEXT: vor.vv v9, v10, v9 ; CHECK-NEXT: vand.vx v10, v8, a0 ; CHECK-NEXT: vsll.vi v10, v10, 8 ; CHECK-NEXT: vsll.vi v8, v8, 24 @@ -198,12 +198,12 @@ define <8 x i32> @vp_bswap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsrl.vi v10, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v10, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v12, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v10, v10, a0, v0.t -; CHECK-NEXT: vsrl.vi v12, v8, 24, v0.t -; CHECK-NEXT: vor.vv v10, v10, v12, v0.t +; CHECK-NEXT: vand.vx v12, v12, a0, v0.t +; CHECK-NEXT: vor.vv v10, v12, v10, v0.t ; CHECK-NEXT: vand.vx v12, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v12, v12, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t @@ -218,12 +218,12 @@ define <8 x i32> @vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v8i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; CHECK-NEXT: vsrl.vi v10, v8, 8 +; CHECK-NEXT: vsrl.vi v10, v8, 24 +; CHECK-NEXT: vsrl.vi v12, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v12, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v10, v10, a0 -; CHECK-NEXT: vor.vv v10, v10, v12 +; CHECK-NEXT: vand.vx v12, v12, a0 +; CHECK-NEXT: vor.vv v10, v12, v10 ; CHECK-NEXT: vand.vx v12, v8, a0 ; CHECK-NEXT: vsll.vi v12, v12, 8 ; CHECK-NEXT: vsll.vi v8, v8, 24 @@ -240,12 +240,12 @@ define <16 x i32> @vp_bswap_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %ev ; CHECK-LABEL: vp_bswap_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsrl.vi v12, v8, 8, v0.t +; CHECK-NEXT: vsrl.vi v12, v8, 24, v0.t ; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsrl.vi v16, v8, 8, v0.t ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v12, v12, a0, v0.t -; CHECK-NEXT: vsrl.vi v16, v8, 24, v0.t -; CHECK-NEXT: vor.vv v12, v12, v16, v0.t +; CHECK-NEXT: vand.vx v16, v16, a0, v0.t +; CHECK-NEXT: vor.vv v12, v16, v12, v0.t ; CHECK-NEXT: vand.vx v16, v8, a0, v0.t ; CHECK-NEXT: vsll.vi v16, v16, 8, v0.t ; CHECK-NEXT: vsll.vi v8, v8, 24, v0.t @@ -260,12 +260,12 @@ define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_bswap_v16i32_unmasked: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; CHECK-NEXT: vsrl.vi v12, v8, 8 +; CHECK-NEXT: vsrl.vi v12, v8, 24 +; CHECK-NEXT: vsrl.vi v16, v8, 8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsrl.vi v16, v8, 24 ; CHECK-NEXT: addi a0, a0, -256 -; CHECK-NEXT: vand.vx v12, v12, a0 -; CHECK-NEXT: vor.vv v12, v12, v16 +; CHECK-NEXT: vand.vx v16, v16, a0 +; CHECK-NEXT: vor.vv v12, v16, v12 ; CHECK-NEXT: vand.vx v16, v8, a0 ; 
CHECK-NEXT: vsll.vi v16, v16, 8 ; CHECK-NEXT: vsll.vi v8, v8, 24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index 61730b87c5517..98e1f43ac34d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -42,7 +42,10 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: li a1, 134 ; RVF-NEXT: vzext.vf2 v12, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v8, v12, 0 ; RVF-NEXT: vrsub.vx v8, v8, a1 @@ -58,7 +61,10 @@ define void @ctlz_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: li a1, 134 ; RVD-NEXT: vzext.vf2 v12, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v8, v12, 0 ; RVD-NEXT: vrsub.vx v8, v8, a1 @@ -124,7 +130,10 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vle16.v v10, (a0) ; RVF-NEXT: li a1, 142 ; RVF-NEXT: vfwcvt.f.xu.v v8, v10 -; RVF-NEXT: vnsrl.wi v10, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVF-NEXT: vnsrl.wi v10, v8, 0 ; RVF-NEXT: vrsub.vx v8, v10, a1 ; RVF-NEXT: li a1, 16 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -137,7 +146,10 @@ define void @ctlz_v8i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle16.v v10, (a0) ; RVD-NEXT: li a1, 142 ; RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wi v10, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: vrsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 16 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -219,8 +231,11 @@ define void @ctlz_v4i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle32.v v10, (a0) ; RVD-NEXT: li a1, 52 ; RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wx v10, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 ; RVD-NEXT: li a1, 1054 +; RVD-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: vrsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -351,16 +366,16 @@ define void @ctlz_v2i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RVF-NEXT: vle64.v v8, (a0) -; RVF-NEXT: li a1, 190 -; RVF-NEXT: vmv.v.x v9, a1 ; RVF-NEXT: fsrmi a1, 1 -; RVF-NEXT: vfncvt.f.xu.w v10, v8 +; RVF-NEXT: vfncvt.f.xu.w v9, v8 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v8, v10, 23 -; RVF-NEXT: vwsubu.vv v10, v9, v8 -; RVF-NEXT: li a1, 64 +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vsrl.vi v8, v9, 23 ; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RVF-NEXT: vminu.vx v8, v10, a1 +; RVF-NEXT: vzext.vf2 v9, v8 +; RVF-NEXT: vrsub.vx v8, v9, a1 +; RVF-NEXT: li a1, 64 +; RVF-NEXT: vminu.vx v8, v8, a1 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; @@ -431,7 +446,10 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: li a1, 134 ; RVF-NEXT: vzext.vf2 v16, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v16 -; RVF-NEXT: vnsrl.wi v16, v8, 23 +; RVF-NEXT: vsetvli 
zero, zero, e32, m8, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVF-NEXT: vnsrl.wi v16, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v8, v16, 0 ; RVF-NEXT: vrsub.vx v8, v8, a1 @@ -448,7 +466,10 @@ define void @ctlz_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: li a1, 134 ; RVD-NEXT: vzext.vf2 v16, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v16 -; RVD-NEXT: vnsrl.wi v16, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVD-NEXT: vnsrl.wi v16, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v8, v16, 0 ; RVD-NEXT: vrsub.vx v8, v8, a1 @@ -515,7 +536,10 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vle16.v v12, (a0) ; RVF-NEXT: li a1, 142 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: vrsub.vx v8, v12, a1 ; RVF-NEXT: li a1, 16 ; RVF-NEXT: vminu.vx v8, v8, a1 @@ -528,7 +552,10 @@ define void @ctlz_v16i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle16.v v12, (a0) ; RVD-NEXT: li a1, 142 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vrsub.vx v8, v12, a1 ; RVD-NEXT: li a1, 16 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -610,8 +637,11 @@ define void @ctlz_v8i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle32.v v12, (a0) ; RVD-NEXT: li a1, 52 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wx v12, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 ; RVD-NEXT: li a1, 1054 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vrsub.vx v8, v12, a1 ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vminu.vx v8, v8, a1 @@ -742,15 +772,15 @@ define void @ctlz_v4i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle64.v v8, (a0) -; RVF-NEXT: li a1, 190 -; RVF-NEXT: vmv.v.x v10, a1 ; RVF-NEXT: fsrmi a1, 1 -; RVF-NEXT: vfncvt.f.xu.w v11, v8 +; RVF-NEXT: vfncvt.f.xu.w v10, v8 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v11, v11, 23 -; RVF-NEXT: vwsubu.vv v8, v10, v11 -; RVF-NEXT: li a1, 64 +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vsrl.vi v10, v10, 23 ; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vzext.vf2 v8, v10 +; RVF-NEXT: vrsub.vx v8, v8, a1 +; RVF-NEXT: li a1, 64 ; RVF-NEXT: vminu.vx v8, v8, a1 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret @@ -819,7 +849,10 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vle8.v v8, (a0) ; RVF-NEXT: vzext.vf2 v12, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v8, v12, 0 ; RVF-NEXT: li a1, 134 @@ -833,7 +866,10 @@ define void @ctlz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle8.v v8, (a0) ; RVD-NEXT: vzext.vf2 v12, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli 
zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v8, v12, 0 ; RVD-NEXT: li a1, 134 @@ -896,7 +932,10 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVF-NEXT: vle16.v v10, (a0) ; RVF-NEXT: vfwcvt.f.xu.v v8, v10 -; RVF-NEXT: vnsrl.wi v10, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVF-NEXT: vnsrl.wi v10, v8, 0 ; RVF-NEXT: li a1, 142 ; RVF-NEXT: vrsub.vx v8, v10, a1 ; RVF-NEXT: vse16.v v8, (a0) @@ -907,7 +946,10 @@ define void @ctlz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; RVD-NEXT: vle16.v v10, (a0) ; RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wi v10, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: li a1, 142 ; RVD-NEXT: vrsub.vx v8, v10, a1 ; RVD-NEXT: vse16.v v8, (a0) @@ -985,7 +1027,10 @@ define void @ctlz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle32.v v10, (a0) ; RVD-NEXT: li a1, 52 ; RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wx v10, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: li a1, 1054 ; RVD-NEXT: vrsub.vx v8, v10, a1 ; RVD-NEXT: vse32.v v8, (a0) @@ -1114,14 +1159,15 @@ define void @ctlz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RVF-NEXT: vle64.v v8, (a0) -; RVF-NEXT: li a1, 190 -; RVF-NEXT: vmv.v.x v9, a1 ; RVF-NEXT: fsrmi a1, 1 -; RVF-NEXT: vfncvt.f.xu.w v10, v8 +; RVF-NEXT: vfncvt.f.xu.w v9, v8 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v8, v10, 23 -; RVF-NEXT: vwsubu.vv v10, v9, v8 -; RVF-NEXT: vse64.v v10, (a0) +; RVF-NEXT: vsrl.vi v8, v9, 23 +; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RVF-NEXT: vzext.vf2 v9, v8 +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vrsub.vx v8, v9, a1 +; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; ; RVD-LABEL: ctlz_zero_undef_v2i64: @@ -1187,7 +1233,10 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vle8.v v8, (a0) ; RVF-NEXT: vzext.vf2 v16, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v16 -; RVF-NEXT: vnsrl.wi v16, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVF-NEXT: vnsrl.wi v16, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v8, v16, 0 ; RVF-NEXT: li a1, 134 @@ -1202,7 +1251,10 @@ define void @ctlz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle8.v v8, (a0) ; RVD-NEXT: vzext.vf2 v16, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v16 -; RVD-NEXT: vnsrl.wi v16, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVD-NEXT: vnsrl.wi v16, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v8, v16, 0 ; RVD-NEXT: li a1, 134 @@ -1266,7 +1318,10 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVF-NEXT: vle16.v v12, (a0) ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, 
zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: li a1, 142 ; RVF-NEXT: vrsub.vx v8, v12, a1 ; RVF-NEXT: vse16.v v8, (a0) @@ -1277,7 +1332,10 @@ define void @ctlz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RVD-NEXT: vle16.v v12, (a0) ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: li a1, 142 ; RVD-NEXT: vrsub.vx v8, v12, a1 ; RVD-NEXT: vse16.v v8, (a0) @@ -1355,7 +1413,10 @@ define void @ctlz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vle32.v v12, (a0) ; RVD-NEXT: li a1, 52 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wx v12, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: li a1, 1054 ; RVD-NEXT: vrsub.vx v8, v12, a1 ; RVD-NEXT: vse32.v v8, (a0) @@ -1484,13 +1545,14 @@ define void @ctlz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVF: # %bb.0: ; RVF-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RVF-NEXT: vle64.v v8, (a0) -; RVF-NEXT: li a1, 190 -; RVF-NEXT: vmv.v.x v10, a1 ; RVF-NEXT: fsrmi a1, 1 -; RVF-NEXT: vfncvt.f.xu.w v11, v8 +; RVF-NEXT: vfncvt.f.xu.w v10, v8 ; RVF-NEXT: fsrm a1 -; RVF-NEXT: vsrl.vi v11, v11, 23 -; RVF-NEXT: vwsubu.vv v8, v10, v11 +; RVF-NEXT: vsrl.vi v10, v10, 23 +; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vzext.vf2 v8, v10 +; RVF-NEXT: li a1, 190 +; RVF-NEXT: vrsub.vx v8, v8, a1 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index 307b143f4449f..8b8471527991c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -41,7 +41,10 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RVF-NEXT: vzext.vf2 v12, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v8, v12, 0 ; RVF-NEXT: vmseq.vi v0, v14, 0 @@ -60,7 +63,10 @@ define void @cttz_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RVD-NEXT: vzext.vf2 v12, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v8, v12, 0 ; RVD-NEXT: vmseq.vi v0, v14, 0 @@ -123,7 +129,10 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vmseq.vi v0, v8, 0 ; RVF-NEXT: vand.vv v10, v8, v9 ; RVF-NEXT: vfwcvt.f.xu.v v8, v10 -; RVF-NEXT: vnsrl.wi v10, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVF-NEXT: vnsrl.wi v10, v8, 0 ; RVF-NEXT: vsub.vx v8, v10, a1 ; RVF-NEXT: li a1, 16 ; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 @@ -139,7 +148,10 @@ define void @cttz_v8i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vmseq.vi v0, v8, 0 ; RVD-NEXT: vand.vv v10, v8, v9 ; 
RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wi v10, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 16 ; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 @@ -218,10 +230,13 @@ define void @cttz_v4i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vrsub.vi v8, v10, 0 ; RVD-NEXT: vand.vv v11, v10, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v11 -; RVD-NEXT: vnsrl.wx v11, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 ; RVD-NEXT: li a1, 1023 +; RVD-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RVD-NEXT: vmseq.vi v0, v10, 0 -; RVD-NEXT: vsub.vx v8, v11, a1 +; RVD-NEXT: vnsrl.wi v10, v8, 0 +; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse32.v v8, (a0) @@ -340,10 +355,11 @@ define void @cttz_v2i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vmseq.vi v0, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RVF-NEXT: vsrl.vi v8, v10, 23 -; RVF-NEXT: vwsubu.vx v9, v8, a1 -; RVF-NEXT: li a1, 64 ; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RVF-NEXT: vmerge.vxm v8, v9, a1, v0 +; RVF-NEXT: vzext.vf2 v9, v8 +; RVF-NEXT: vsub.vx v8, v9, a1 +; RVF-NEXT: li a1, 64 +; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; @@ -416,7 +432,10 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; RVF-NEXT: vzext.vf2 v16, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v16 -; RVF-NEXT: vnsrl.wi v16, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVF-NEXT: vnsrl.wi v16, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v8, v16, 0 ; RVF-NEXT: vmseq.vi v0, v20, 0 @@ -436,7 +455,10 @@ define void @cttz_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; RVD-NEXT: vzext.vf2 v16, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v16 -; RVD-NEXT: vnsrl.wi v16, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVD-NEXT: vnsrl.wi v16, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v8, v16, 0 ; RVD-NEXT: vmseq.vi v0, v20, 0 @@ -500,7 +522,10 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vmseq.vi v0, v8, 0 ; RVF-NEXT: vand.vv v12, v8, v10 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: vsub.vx v8, v12, a1 ; RVF-NEXT: li a1, 16 ; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 @@ -516,7 +541,10 @@ define void @cttz_v16i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vmseq.vi v0, v8, 0 ; RVD-NEXT: vand.vv v12, v8, v10 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vsub.vx v8, v12, a1 ; RVD-NEXT: li a1, 16 ; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 @@ -595,10 +623,13 @@ define void @cttz_v8i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vrsub.vi v8, v12, 0 ; RVD-NEXT: vand.vv v14, v12, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v14 -; RVD-NEXT: vnsrl.wx v14, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m4, 
ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 ; RVD-NEXT: li a1, 1023 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RVD-NEXT: vmseq.vi v0, v12, 0 -; RVD-NEXT: vsub.vx v8, v14, a1 +; RVD-NEXT: vnsrl.wi v12, v8, 0 +; RVD-NEXT: vsub.vx v8, v12, a1 ; RVD-NEXT: li a1, 32 ; RVD-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVD-NEXT: vse32.v v8, (a0) @@ -717,9 +748,10 @@ define void @cttz_v4i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vmseq.vi v0, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RVF-NEXT: vsrl.vi v10, v12, 23 -; RVF-NEXT: vwsubu.vx v8, v10, a1 -; RVF-NEXT: li a1, 64 ; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vzext.vf2 v8, v10 +; RVF-NEXT: vsub.vx v8, v8, a1 +; RVF-NEXT: li a1, 64 ; RVF-NEXT: vmerge.vxm v8, v8, a1, v0 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret @@ -790,7 +822,10 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RVF-NEXT: vzext.vf2 v12, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVF-NEXT: vnsrl.wi v8, v12, 0 ; RVF-NEXT: li a1, 127 @@ -807,7 +842,10 @@ define void @cttz_zero_undef_v16i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RVD-NEXT: vzext.vf2 v12, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RVD-NEXT: vnsrl.wi v8, v12, 0 ; RVD-NEXT: li a1, 127 @@ -866,7 +904,10 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vrsub.vi v9, v8, 0 ; RVF-NEXT: vand.vv v10, v8, v9 ; RVF-NEXT: vfwcvt.f.xu.v v8, v10 -; RVF-NEXT: vnsrl.wi v10, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVF-NEXT: vnsrl.wi v10, v8, 0 ; RVF-NEXT: li a1, 127 ; RVF-NEXT: vsub.vx v8, v10, a1 ; RVF-NEXT: vse16.v v8, (a0) @@ -879,7 +920,10 @@ define void @cttz_zero_undef_v8i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v10, v8, v9 ; RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wi v10, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: li a1, 127 ; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: vse16.v v8, (a0) @@ -953,7 +997,10 @@ define void @cttz_zero_undef_v4i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vrsub.vi v9, v8, 0 ; RVD-NEXT: vand.vv v10, v8, v9 ; RVD-NEXT: vfwcvt.f.xu.v v8, v10 -; RVD-NEXT: vnsrl.wx v10, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RVD-NEXT: vnsrl.wi v10, v8, 0 ; RVD-NEXT: li a1, 1023 ; RVD-NEXT: vsub.vx v8, v10, a1 ; RVD-NEXT: vse32.v v8, (a0) @@ -1067,9 +1114,11 @@ define void @cttz_zero_undef_v2i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vfncvt.f.xu.w v9, v8 ; RVF-NEXT: fsrm a1 ; RVF-NEXT: vsrl.vi v8, v9, 23 +; RVF-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RVF-NEXT: vzext.vf2 v9, v8 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vwsubu.vx v9, v8, a1 -; RVF-NEXT: vse64.v v9, (a0) +; RVF-NEXT: vsub.vx v8, v9, a1 +; RVF-NEXT: vse64.v v8, 
(a0) ; RVF-NEXT: ret ; ; RVD-LABEL: cttz_zero_undef_v2i64: @@ -1136,7 +1185,10 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; RVF-NEXT: vzext.vf2 v16, v8 ; RVF-NEXT: vfwcvt.f.xu.v v8, v16 -; RVF-NEXT: vnsrl.wi v16, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVF-NEXT: vnsrl.wi v16, v8, 0 ; RVF-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVF-NEXT: vnsrl.wi v8, v16, 0 ; RVF-NEXT: li a1, 127 @@ -1154,7 +1206,10 @@ define void @cttz_zero_undef_v32i8(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; RVD-NEXT: vzext.vf2 v16, v8 ; RVD-NEXT: vfwcvt.f.xu.v v8, v16 -; RVD-NEXT: vnsrl.wi v16, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; RVD-NEXT: vnsrl.wi v16, v8, 0 ; RVD-NEXT: vsetvli zero, zero, e8, m2, ta, ma ; RVD-NEXT: vnsrl.wi v8, v16, 0 ; RVD-NEXT: li a1, 127 @@ -1214,7 +1269,10 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vrsub.vi v10, v8, 0 ; RVF-NEXT: vand.vv v12, v8, v10 ; RVF-NEXT: vfwcvt.f.xu.v v8, v12 -; RVF-NEXT: vnsrl.wi v12, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVF-NEXT: vsrl.vi v8, v8, 23 +; RVF-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVF-NEXT: vnsrl.wi v12, v8, 0 ; RVF-NEXT: li a1, 127 ; RVF-NEXT: vsub.vx v8, v12, a1 ; RVF-NEXT: vse16.v v8, (a0) @@ -1227,7 +1285,10 @@ define void @cttz_zero_undef_v16i16(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v12, v8, v10 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wi v12, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RVD-NEXT: vsrl.vi v8, v8, 23 +; RVD-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: li a1, 127 ; RVD-NEXT: vsub.vx v8, v12, a1 ; RVD-NEXT: vse16.v v8, (a0) @@ -1301,7 +1362,10 @@ define void @cttz_zero_undef_v8i32(ptr %x, ptr %y) nounwind { ; RVD-NEXT: vrsub.vi v10, v8, 0 ; RVD-NEXT: vand.vv v12, v8, v10 ; RVD-NEXT: vfwcvt.f.xu.v v8, v12 -; RVD-NEXT: vnsrl.wx v12, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RVD-NEXT: vsrl.vx v8, v8, a1 +; RVD-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RVD-NEXT: vnsrl.wi v12, v8, 0 ; RVD-NEXT: li a1, 1023 ; RVD-NEXT: vsub.vx v8, v12, a1 ; RVD-NEXT: vse32.v v8, (a0) @@ -1415,8 +1479,10 @@ define void @cttz_zero_undef_v4i64(ptr %x, ptr %y) nounwind { ; RVF-NEXT: vfncvt.f.xu.w v10, v8 ; RVF-NEXT: fsrm a1 ; RVF-NEXT: vsrl.vi v10, v10, 23 +; RVF-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RVF-NEXT: vzext.vf2 v8, v10 ; RVF-NEXT: li a1, 127 -; RVF-NEXT: vwsubu.vx v8, v10, a1 +; RVF-NEXT: vsub.vx v8, v8, a1 ; RVF-NEXT: vse64.v v8, (a0) ; RVF-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll index dba5d26c216fa..24e8a1c1dcf02 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract.ll @@ -1319,9 +1319,9 @@ define i32 @extractelt_sdiv_v4i32(<4 x i32> %x) { ; RV32NOM-NEXT: vmv.s.x v9, a0 ; RV32NOM-NEXT: vsext.vf4 v11, v9 ; RV32NOM-NEXT: vadd.vv v8, v8, v10 -; RV32NOM-NEXT: vsra.vv v9, v8, v11 -; RV32NOM-NEXT: vsrl.vi v8, v8, 31 -; RV32NOM-NEXT: vadd.vv v8, v9, v8 +; RV32NOM-NEXT: vsrl.vi v9, v8, 31 +; RV32NOM-NEXT: vsra.vv v8, v8, v11 +; RV32NOM-NEXT: vadd.vv v8, v8, v9 ; RV32NOM-NEXT: vslidedown.vi v8, v8, 2 ; 
RV32NOM-NEXT: vmv.x.s a0, v8 ; RV32NOM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll index 2bf039bd0104a..85c4811f792f4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -28,14 +28,17 @@ define <2 x half> @vfmax_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 -; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v9, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v9, v12, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -96,14 +99,17 @@ define <4 x half> @vfmax_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 -; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v9, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v9, v12, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 ; ZVFHMIN-NEXT: vfmax.vv v9, v9, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -164,16 +170,18 @@ define <8 x half> @vfmax_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v16, v14, v12, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v16, v12, v14, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v14, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v14, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 ; 
ZVFHMIN-NEXT: vfmax.vv v12, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -236,16 +244,18 @@ define <16 x half> @vfmax_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v12, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v24, v20, v16, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v20, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v20, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v20, v16, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vfmax.vv v16, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll index c6cd366497218..e7821cba96ddc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll @@ -293,11 +293,11 @@ define <2 x half> @vfmax_v2f16_vv_nnan(<2 x half> %a, <2 x half> %b) { define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmax_v2f16_vv_nnana: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v9, v9 -; ZVFH-NEXT: vmv1r.v v10, v9 -; ZVFH-NEXT: vfadd.vv v10, v8, v8, v0.t -; ZVFH-NEXT: vfmax.vv v8, v10, v9 +; ZVFH-NEXT: vfadd.vv v8, v8, v8 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmax.vv v8, v8, v9 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnana: @@ -327,11 +327,11 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmax_v2f16_vv_nnanb: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmv1r.v v10, v8 -; ZVFH-NEXT: vfadd.vv v10, v9, v9, v0.t -; ZVFH-NEXT: vfmax.vv v8, v8, v10 +; ZVFH-NEXT: vfadd.vv v9, v9, v9 +; ZVFH-NEXT: vmerge.vvm v9, v8, v9, v0 +; ZVFH-NEXT: vfmax.vv v8, v8, v9 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vfmax_v2f16_vv_nnanb: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll index 73d83e86af4c6..f8e6b3bf72ac1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -28,14 +28,17 @@ define <2 x half> @vfmin_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, 
v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 -; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v9, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v9, v12, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -96,14 +99,17 @@ define <4 x half> @vfmin_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 -; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t -; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v9, v0 +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v9, v12, v11, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 ; ZVFHMIN-NEXT: vfmin.vv v9, v9, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -164,16 +170,18 @@ define <8 x half> @vfmin_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v10, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v16, v14, v12, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v16, v12, v14, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v14, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v14, v12, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v10 ; ZVFHMIN-NEXT: vfmin.vv v12, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -236,16 +244,18 @@ define <16 x half> @vfmin_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v12, v0 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10, v0.t -; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v24, v20, v16, v0 +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v20, v0 ; ZVFHMIN-NEXT: 
vmv1r.v v0, v12 -; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t ; ZVFHMIN-NEXT: vmv1r.v v0, v8 -; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v20, v0 +; ZVFHMIN-NEXT: vmerge.vvm v8, v20, v16, v0 ; ZVFHMIN-NEXT: vmv1r.v v0, v12 ; ZVFHMIN-NEXT: vfmin.vv v16, v8, v24, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll index 568923db83591..5da81fc76629e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll @@ -293,11 +293,11 @@ define <2 x half> @vfmin_v2f16_vv_nnan(<2 x half> %a, <2 x half> %b) { define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmin_v2f16_vv_nnana: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v9, v9 -; ZVFH-NEXT: vmv1r.v v10, v9 -; ZVFH-NEXT: vfadd.vv v10, v8, v8, v0.t -; ZVFH-NEXT: vfmin.vv v8, v10, v9 +; ZVFH-NEXT: vfadd.vv v8, v8, v8 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v9 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnana: @@ -327,11 +327,11 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFH-LABEL: vfmin_v2f16_vv_nnanb: ; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, mu +; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmv1r.v v10, v8 -; ZVFH-NEXT: vfadd.vv v10, v9, v9, v0.t -; ZVFH-NEXT: vfmin.vv v8, v8, v10 +; ZVFH-NEXT: vfadd.vv v9, v9, v9 +; ZVFH-NEXT: vmerge.vvm v9, v8, v9, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v9 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: vfmin_v2f16_vv_nnanb: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll index abb929eaaf6e6..9c79a5233403f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -470,7 +470,8 @@ define void @fcmp_ueq_vv_v32f32(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: vmflt.vv v24, v8, v16 ; CHECK-NEXT: vmflt.vv v25, v16, v8 -; CHECK-NEXT: vmnor.mm v8, v25, v24 +; CHECK-NEXT: vmor.mm v8, v25, v24 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a2) ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x @@ -1102,7 +1103,8 @@ define void @fcmp_ueq_vf_v32f32(ptr %x, float %y, ptr %z) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmflt.vf v16, v8, fa0 ; CHECK-NEXT: vmfgt.vf v17, v8, fa0 -; CHECK-NEXT: vmnor.mm v8, v17, v16 +; CHECK-NEXT: vmor.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a1) ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x @@ -1743,7 +1745,8 @@ define void @fcmp_ueq_fv_v32f32(ptr %x, float %y, ptr %z) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmfgt.vf v16, v8, fa0 ; CHECK-NEXT: vmflt.vf v17, v8, fa0 -; CHECK-NEXT: vmnor.mm v8, v17, v16 +; CHECK-NEXT: vmor.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a1) ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 38df622998bf9..a9c23851d3d44 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1067,9 +1067,10 @@ define 
void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vfsgnjn.vv v8, v8, v9 +; ZVFH-NEXT: vle16.v v8, (a1) +; ZVFH-NEXT: vle16.v v9, (a0) +; ZVFH-NEXT: vfneg.v v8, v8 +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1098,9 +1099,10 @@ define void @copysign_neg_v6f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vfsgnjn.vv v8, v8, v9 +; ZVFH-NEXT: vle16.v v8, (a1) +; ZVFH-NEXT: vle16.v v9, (a0) +; ZVFH-NEXT: vfneg.v v8, v8 +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1129,9 +1131,10 @@ define void @copysign_neg_v4f32(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vfsgnjn.vv v8, v8, v9 +; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfsgnj.vv v8, v9, v8 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x @@ -1146,9 +1149,10 @@ define void @copysign_neg_v2f64(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: vfsgnjn.vv v8, v8, v9 +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfsgnj.vv v8, v9, v8 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x @@ -1214,7 +1218,8 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vle32.v v8, (a1) ; ZVFH-NEXT: vle16.v v9, (a0) ; ZVFH-NEXT: vfncvt.f.f.w v10, v8 -; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10 +; ZVFH-NEXT: vfneg.v v8, v10 +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1248,7 +1253,8 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-NEXT: vle32.v v8, (a1) ; ZVFH-NEXT: vle16.v v9, (a0) ; ZVFH-NEXT: vfncvt.f.f.w v10, v8 -; ZVFH-NEXT: vfsgnjn.vv v8, v9, v10 +; ZVFH-NEXT: vfneg.v v8, v10 +; ZVFH-NEXT: vfsgnj.vv v8, v9, v8 ; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; @@ -1283,7 +1289,8 @@ define void @copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) { ; CHECK-NEXT: vle64.v v9, (a0) ; CHECK-NEXT: vfwcvt.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfsgnjn.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfsgnj.vv v8, v9, v8 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x @@ -1622,10 +1629,11 @@ define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vfmsac.vv v10, v8, v9 -; ZVFH-NEXT: vse16.v v10, (a0) +; ZVFH-NEXT: vle16.v v9, (a2) +; ZVFH-NEXT: vle16.v v10, (a1) +; ZVFH-NEXT: vfneg.v v9, v9 +; ZVFH-NEXT: vfmacc.vv v9, v8, v10 +; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fmsub_v8f16: @@ -1659,10 +1667,11 @@ define void @fmsub_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; 
ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vfmsac.vv v10, v8, v9 -; ZVFH-NEXT: vse16.v v10, (a0) +; ZVFH-NEXT: vle16.v v9, (a2) +; ZVFH-NEXT: vle16.v v10, (a1) +; ZVFH-NEXT: vfneg.v v9, v9 +; ZVFH-NEXT: vfmacc.vv v9, v8, v10 +; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fmsub_v6f16: @@ -1698,8 +1707,9 @@ define void @fnmsub_v4f32(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: vle32.v v10, (a2) -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 -; CHECK-NEXT: vse32.v v10, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x float>, ptr %y @@ -1714,11 +1724,13 @@ define void @fnmadd_v2f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fnmadd_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: vle64.v v10, (a2) -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 -; CHECK-NEXT: vse64.v v10, (a0) +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vle64.v v9, (a2) +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v9, v10, v8 +; CHECK-NEXT: vse64.v v9, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = load <2 x double>, ptr %y @@ -2249,13 +2261,13 @@ define void @fadd_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-LABEL: fadd_vf_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfadd.vv v8, v8, v10 +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -2311,13 +2323,13 @@ define void @fadd_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-LABEL: fadd_vf_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v10, (a0) +; ZVFHMIN-NEXT: vle16.v v12, (a0) ; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfadd.vv v8, v8, v10 +; ZVFHMIN-NEXT: vfadd.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -2529,13 +2541,13 @@ define void @fsub_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-LABEL: fsub_vf_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfsub.vv v8, v8, v10 +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -2591,13 +2603,13 @@ define void @fsub_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-LABEL: fsub_vf_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: 
vle16.v v10, (a0) +; ZVFHMIN-NEXT: vle16.v v12, (a0) ; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfsub.vv v8, v8, v10 +; ZVFHMIN-NEXT: vfsub.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -2809,13 +2821,13 @@ define void @fmul_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-LABEL: fmul_vf_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmul.vv v8, v8, v10 +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -2871,13 +2883,13 @@ define void @fmul_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-LABEL: fmul_vf_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v10, (a0) +; ZVFHMIN-NEXT: vle16.v v12, (a0) ; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v8, v8, v10 +; ZVFHMIN-NEXT: vfmul.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -3089,13 +3101,13 @@ define void @fdiv_vf_v6bf16(ptr %x, bfloat %y) { ; CHECK-LABEL: fdiv_vf_v6bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfdiv.vv v8, v8, v10 +; CHECK-NEXT: vfdiv.vv v8, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) @@ -3151,13 +3163,13 @@ define void @fdiv_vf_v6f16(ptr %x, half %y) { ; ZVFHMIN-LABEL: fdiv_vf_v6f16: ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v10, (a0) +; ZVFHMIN-NEXT: vle16.v v12, (a0) ; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v12, a1 +; ZVFHMIN-NEXT: vmv.v.x v10, a1 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v12 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v8, v8, v10 +; ZVFHMIN-NEXT: vfdiv.vv v8, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) @@ -3529,16 +3541,16 @@ define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) { define void @fma_fv_v6bf16(ptr %x, ptr %y, bfloat %z) { ; CHECK-LABEL: fma_fv_v6bf16: ; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: fmv.x.w a1, fa0 -; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: 
vle16.v v14, (a0) +; CHECK-NEXT: vmv.v.x v10, a2 ; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v12 ; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v14 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfmadd.vv v12, v10, v8 +; CHECK-NEXT: vfmadd.vv v12, v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v12 ; CHECK-NEXT: vse16.v v8, (a0) @@ -3599,16 +3611,16 @@ define void @fma_fv_v6f16(ptr %x, ptr %y, half %z) { ; ; ZVFHMIN-LABEL: fma_fv_v6f16: ; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: fmv.x.w a2, fa0 ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFHMIN-NEXT: vle16.v v10, (a1) -; ZVFHMIN-NEXT: vle16.v v12, (a0) -; ZVFHMIN-NEXT: fmv.x.w a1, fa0 -; ZVFHMIN-NEXT: vmv.v.x v14, a1 +; ZVFHMIN-NEXT: vle16.v v12, (a1) +; ZVFHMIN-NEXT: vle16.v v14, (a0) +; ZVFHMIN-NEXT: vmv.v.x v10, a2 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v10 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v12 ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v14 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v12, v10, v8 +; ZVFHMIN-NEXT: vfmadd.vv v12, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 ; ZVFHMIN-NEXT: vse16.v v8, (a0) @@ -3720,10 +3732,11 @@ define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fmsub_vf_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vfmsac.vf v9, fa0, v8 -; ZVFH-NEXT: vse16.v v9, (a0) +; ZVFH-NEXT: vle16.v v8, (a1) +; ZVFH-NEXT: vle16.v v9, (a0) +; ZVFH-NEXT: vfneg.v v8, v8 +; ZVFH-NEXT: vfmacc.vf v8, fa0, v9 +; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fmsub_vf_v8f16: @@ -3758,10 +3771,11 @@ define void @fmsub_vf_v6f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fmsub_vf_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma -; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vfmsac.vf v9, fa0, v8 -; ZVFH-NEXT: vse16.v v9, (a0) +; ZVFH-NEXT: vle16.v v8, (a1) +; ZVFH-NEXT: vle16.v v9, (a0) +; ZVFH-NEXT: vfneg.v v8, v8 +; ZVFH-NEXT: vfmacc.vf v8, fa0, v9 +; ZVFH-NEXT: vse16.v v8, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fmsub_vf_v6f16: @@ -3798,8 +3812,9 @@ define void @fnmsub_vf_v4f32(ptr %x, ptr %y, float %z) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x float>, ptr %y @@ -3817,7 +3832,9 @@ define void @fnmadd_vf_v2f64(ptr %x, ptr %y, double %z) { ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vf v9, fa0, v8 ; CHECK-NEXT: vse64.v v9, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x @@ -3837,8 +3854,10 @@ define void @fnmsub_fv_v4f32(ptr %x, ptr %y, float %z) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 -; CHECK-NEXT: vse32.v v9, (a0) +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v10, v8, v9 +; CHECK-NEXT: vse32.v v10, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 
x float>, ptr %y @@ -3854,10 +3873,13 @@ define void @fnmadd_fv_v2f64(ptr %x, ptr %y, double %z) { ; CHECK-LABEL: fnmadd_fv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = load <2 x double>, ptr %y @@ -5067,10 +5089,11 @@ define void @fmsub_fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vfmsac.vv v10, v8, v9 -; ZVFH-NEXT: vse16.v v10, (a0) +; ZVFH-NEXT: vle16.v v9, (a2) +; ZVFH-NEXT: vle16.v v10, (a1) +; ZVFH-NEXT: vfneg.v v9, v9 +; ZVFH-NEXT: vfmacc.vv v9, v8, v10 +; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fmsub_fmuladd_v8f16: @@ -5107,10 +5130,11 @@ define void @fmsub_fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: vle16.v v9, (a1) -; ZVFH-NEXT: vle16.v v10, (a2) -; ZVFH-NEXT: vfmsac.vv v10, v8, v9 -; ZVFH-NEXT: vse16.v v10, (a0) +; ZVFH-NEXT: vle16.v v9, (a2) +; ZVFH-NEXT: vle16.v v10, (a1) +; ZVFH-NEXT: vfneg.v v9, v9 +; ZVFH-NEXT: vfmacc.vv v9, v8, v10 +; ZVFH-NEXT: vse16.v v9, (a0) ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fmsub_fmuladd_v6f16: @@ -5149,8 +5173,9 @@ define void @fnmsub_fmuladd_v4f32(ptr %x, ptr %y, ptr %z) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v9, (a1) ; CHECK-NEXT: vle32.v v10, (a2) -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 -; CHECK-NEXT: vse32.v v10, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x float>, ptr %y @@ -5165,11 +5190,13 @@ define void @fnmadd_fmuladd_v2f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fnmadd_fmuladd_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: vle64.v v10, (a2) -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 -; CHECK-NEXT: vse64.v v10, (a0) +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: vle64.v v9, (a2) +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v9, v10, v8 +; CHECK-NEXT: vse64.v v9, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = load <2 x double>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll index 81679806f32d8..0068064ea6ca0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fshr-fshl-vp.ll @@ -384,9 +384,9 @@ define <2 x i32> @fshr_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vand.vx v11, v10, a1, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vx v10, v10, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret 
@@ -402,9 +402,9 @@ define <2 x i32> @fshl_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vand.vx v11, v10, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vx v10, v10, a1, v0.t -; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -420,9 +420,9 @@ define <4 x i32> @fshr_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vand.vx v11, v10, a1, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vx v10, v10, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -438,9 +438,9 @@ define <4 x i32> @fshl_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vand.vx v11, v10, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vx v10, v10, a1, v0.t -; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -456,9 +456,9 @@ define <8 x i32> @fshr_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vand.vx v14, v12, a1, v0.t ; CHECK-NEXT: vsrl.vv v10, v10, v14, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v12, v12, v0.t ; CHECK-NEXT: vand.vx v12, v12, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v12, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: ret @@ -474,9 +474,9 @@ define <8 x i32> @fshl_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vand.vx v14, v12, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v14, v0.t +; CHECK-NEXT: vsrl.vi v10, v10, 1, v0.t ; CHECK-NEXT: vnot.v v12, v12, v0.t ; CHECK-NEXT: vand.vx v12, v12, a1, v0.t -; CHECK-NEXT: vsrl.vi v10, v10, 1, v0.t ; CHECK-NEXT: vsrl.vv v10, v10, v12, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: ret @@ -492,9 +492,9 @@ define <16 x i32> @fshr_v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vand.vx v20, v16, a1, v0.t ; CHECK-NEXT: vsrl.vv v12, v12, v20, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v16, v16, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret @@ -510,9 +510,9 @@ define <16 x i32> @fshl_v16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vand.vx v20, v16, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v20, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t ; CHECK-NEXT: vnot.v v16, v16, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t ; CHECK-NEXT: vsrl.vv v12, v12, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret @@ -528,9 +528,9 @@ define <2 x i64> @fshr_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vand.vx 
v11, v10, a1, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v11, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vx v10, v10, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -546,9 +546,9 @@ define <2 x i64> @fshl_v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vand.vx v11, v10, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v11, v0.t +; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t ; CHECK-NEXT: vnot.v v10, v10, v0.t ; CHECK-NEXT: vand.vx v10, v10, a1, v0.t -; CHECK-NEXT: vsrl.vi v9, v9, 1, v0.t ; CHECK-NEXT: vsrl.vv v9, v9, v10, v0.t ; CHECK-NEXT: vor.vv v8, v8, v9, v0.t ; CHECK-NEXT: ret @@ -564,9 +564,9 @@ define <4 x i64> @fshr_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vand.vx v14, v12, a1, v0.t ; CHECK-NEXT: vsrl.vv v10, v10, v14, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v12, v12, v0.t ; CHECK-NEXT: vand.vx v12, v12, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v12, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: ret @@ -582,9 +582,9 @@ define <4 x i64> @fshl_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vand.vx v14, v12, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v14, v0.t +; CHECK-NEXT: vsrl.vi v10, v10, 1, v0.t ; CHECK-NEXT: vnot.v v12, v12, v0.t ; CHECK-NEXT: vand.vx v12, v12, a1, v0.t -; CHECK-NEXT: vsrl.vi v10, v10, 1, v0.t ; CHECK-NEXT: vsrl.vv v10, v10, v12, v0.t ; CHECK-NEXT: vor.vv v8, v8, v10, v0.t ; CHECK-NEXT: ret @@ -600,9 +600,9 @@ define <7 x i64> @fshr_v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vand.vx v20, v16, a1, v0.t ; CHECK-NEXT: vsrl.vv v12, v12, v20, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v16, v16, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret @@ -618,9 +618,9 @@ define <7 x i64> @fshl_v7i64(<7 x i64> %a, <7 x i64> %b, <7 x i64> %c, <7 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vand.vx v20, v16, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v20, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t ; CHECK-NEXT: vnot.v v16, v16, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t ; CHECK-NEXT: vsrl.vv v12, v12, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret @@ -636,9 +636,9 @@ define <8 x i64> @fshr_v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vand.vx v20, v16, a1, v0.t ; CHECK-NEXT: vsrl.vv v12, v12, v20, v0.t +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vnot.v v16, v16, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, v0.t -; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret @@ -654,9 +654,9 @@ define <8 x i64> @fshl_v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vand.vx v20, v16, a1, v0.t ; CHECK-NEXT: vsll.vv v8, v8, v20, v0.t +; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t ; CHECK-NEXT: vnot.v v16, v16, v0.t ; CHECK-NEXT: vand.vx v16, v16, a1, 
v0.t -; CHECK-NEXT: vsrl.vi v12, v12, 1, v0.t ; CHECK-NEXT: vsrl.vv v12, v12, v16, v0.t ; CHECK-NEXT: vor.vv v8, v8, v12, v0.t ; CHECK-NEXT: ret @@ -682,12 +682,12 @@ define <16 x i64> @fshr_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t ; CHECK-NEXT: vsrl.vv v16, v16, v8, v0.t -; CHECK-NEXT: vnot.v v8, v24, v0.t -; CHECK-NEXT: vand.vx v8, v8, a0, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsll.vi v24, v24, 1, v0.t -; CHECK-NEXT: vsll.vv v8, v24, v8, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsll.vi v8, v8, 1, v0.t +; CHECK-NEXT: vnot.v v24, v24, v0.t +; CHECK-NEXT: vand.vx v24, v24, a0, v0.t +; CHECK-NEXT: vsll.vv v8, v8, v24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -719,12 +719,12 @@ define <16 x i64> @fshl_v16i64(<16 x i64> %a, <16 x i64> %b, <16 x i64> %c, <16 ; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; CHECK-NEXT: vand.vx v8, v24, a0, v0.t ; CHECK-NEXT: vsll.vv v8, v16, v8, v0.t -; CHECK-NEXT: vnot.v v16, v24, v0.t -; CHECK-NEXT: vand.vx v16, v16, a0, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsrl.vi v24, v24, 1, v0.t -; CHECK-NEXT: vsrl.vv v16, v24, v16, v0.t +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsrl.vi v16, v16, 1, v0.t +; CHECK-NEXT: vnot.v v24, v24, v0.t +; CHECK-NEXT: vand.vx v24, v24, a0, v0.t +; CHECK-NEXT: vsrl.vv v16, v16, v24, v0.t ; CHECK-NEXT: vor.vv v8, v8, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll index 3012831ed873e..af1aeba610179 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll @@ -17,8 +17,10 @@ declare <4 x i32> @llvm.vp.load.v4i32(ptr, <4 x i1>, i32) define <4 x i32> @insert_subvector_vp_load_v4i32_v4i32(<4 x i32> %v1, ptr %p, <4 x i1> %mask) { ; CHECK-LABEL: insert_subvector_vp_load_v4i32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %v2 = call <4 x i32> @llvm.vp.load.v4i32(ptr %p, <4 x i1> %mask, i32 4) %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> @@ -57,8 +59,9 @@ define <4 x i32> @insert_subvector_add_v4i32_v4i32(<4 x i32> %v1, <4 x i32> %v2) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vv v9, v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma -; CHECK-NEXT: vadd.vv v8, v9, v10 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %v3 = add <4 x i32> %v2, %v4 = shufflevector <4 x i32> %v3, <4 x i32> %v1, <4 x i32> @@ -69,8 +72,10 @@ declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32) define <4 x i32> @insert_subvector_vp_add_v4i32_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i1> %mask) { ; CHECK-LABEL: insert_subvector_vp_add_v4i32_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli 
zero, 2, e32, m1, tu, mu -; CHECK-NEXT: vadd.vi v8, v9, 1, v0.t +; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; CHECK-NEXT: vadd.vi v9, v9, 1, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %v3 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %v2, <4 x i32> , <4 x i1> %mask, i32 4) %v4 = shufflevector <4 x i32> %v3, <4 x i32> %v1, <4 x i32> @@ -168,8 +173,9 @@ define <4 x i32> @insert_subvector_add_v4i32_v8i32(<4 x i32> %v1, <8 x i32> %v2) ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vadd.vv v9, v10, v9 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma -; CHECK-NEXT: vadd.vv v8, v10, v9 +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %v3 = add <8 x i32> %v2, %v4 = shufflevector <8 x i32> %v3, <8 x i32> poison, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 3a5b3719931a9..890854c69129e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -783,18 +783,32 @@ define void @insertelt_c6_v8i64_store(ptr %x, i32 %idx) { ; Test that using a insertelement at element 0 by a later operation doesn't ; crash the compiler. define void @insertelt_c6_v8i64_0_add(ptr %x, ptr %y) { -; CHECK-LABEL: insertelt_c6_v8i64_0_add: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v12, (a1) -; CHECK-NEXT: li a1, 6 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma -; CHECK-NEXT: vmv.s.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v12 -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32-LABEL: insertelt_c6_v8i64_0_add: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vle64.v v12, (a1) +; RV32-NEXT: vmv.v.v v8, v8 +; RV32-NEXT: li a1, 6 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_c6_v8i64_0_add: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vle64.v v12, (a1) +; RV64-NEXT: li a1, 6 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: ret ; ; VISNI-LABEL: insertelt_c6_v8i64_0_add: ; VISNI: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index f235540cc8ffb..b676e9781ec6d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -262,10 +262,10 @@ define <4 x i8> @buildvec_vid_stepn3_add3_v4i8() { ; CHECK-LABEL: buildvec_vid_stepn3_add3_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 3 -; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vmv.v.i v8, 3 ; CHECK-NEXT: li a0, -3 -; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: vmacc.vx v8, a0, v9 ; CHECK-NEXT: ret ret <4 x i8> } @@ -274,10 +274,10 @@ define void @buildvec_vid_stepn3_addn3_v4i32(ptr %z0, ptr %z1, ptr %z2, ptr %z3) ; CHECK-LABEL: buildvec_vid_stepn3_addn3_v4i32: ; 
CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v8, -3 -; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vmv.v.i v9, -3 ; CHECK-NEXT: li a4, -3 -; CHECK-NEXT: vmadd.vx v9, a4, v8 +; CHECK-NEXT: vmacc.vx v9, a4, v8 ; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: vse32.v v9, (a1) ; CHECK-NEXT: vse32.v v9, (a2) @@ -1261,23 +1261,23 @@ define <8 x i64> @v8xi64_exact_undef_suffix(i64 %a, i64 %b, i64 %c, i64 %d) vsca ; RV32-LABEL: v8xi64_exact_undef_suffix: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 -; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vslide1down.vx v8, v8, a5 -; RV32-NEXT: vslide1down.vx v10, v9, a1 -; RV32-NEXT: vslide1down.vx v8, v8, a6 -; RV32-NEXT: vslide1down.vx v9, v8, a7 -; RV32-NEXT: vslide1down.vx v8, v10, a2 +; RV32-NEXT: vmv.v.x v8, a0 +; RV32-NEXT: vmv.v.x v9, a4 +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslide1down.vx v9, v9, a5 +; RV32-NEXT: vslide1down.vx v8, v8, a2 ; RV32-NEXT: vslide1down.vx v8, v8, a3 +; RV32-NEXT: vslide1down.vx v9, v9, a6 +; RV32-NEXT: vslide1down.vx v9, v9, a7 ; RV32-NEXT: ret ; ; RV64V-LABEL: v8xi64_exact_undef_suffix: ; RV64V: # %bb.0: ; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64V-NEXT: vmv.v.x v8, a2 -; RV64V-NEXT: vslide1down.vx v9, v8, a3 ; RV64V-NEXT: vmv.v.x v8, a0 ; RV64V-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-NEXT: vmv.v.x v9, a2 +; RV64V-NEXT: vslide1down.vx v9, v9, a3 ; RV64V-NEXT: ret ; ; RV64ZVE32-LABEL: v8xi64_exact_undef_suffix: @@ -2744,13 +2744,15 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV32-ONLY-NEXT: vmv.v.x v9, a6 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a7 -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 2 -; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 2 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t0 ; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV32-ONLY-NEXT: vslidedown.vi v9, v9, 1 -; RV32-ONLY-NEXT: vslidedown.vi v8, v8, 1 +; RV32-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV32-ONLY-NEXT: vslide1down.vx v9, v9, t1 ; RV32-ONLY-NEXT: vslide1down.vx v10, v8, a5 ; RV32-ONLY-NEXT: vslide1down.vx v8, v9, a0 @@ -2759,33 +2761,49 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; ; RV32VB-LABEL: buildvec_v16i8_loads_undef_scattered: ; RV32VB: # %bb.0: -; RV32VB-NEXT: lbu a1, 1(a0) -; RV32VB-NEXT: lbu a2, 0(a0) -; RV32VB-NEXT: lbu a3, 44(a0) -; RV32VB-NEXT: lbu a4, 55(a0) -; RV32VB-NEXT: slli a1, a1, 8 -; RV32VB-NEXT: or a1, a2, a1 +; RV32VB-NEXT: zext.b a1, a0 ; RV32VB-NEXT: lbu a2, 75(a0) -; RV32VB-NEXT: lbu a5, 82(a0) -; RV32VB-NEXT: lbu a6, 93(a0) -; RV32VB-NEXT: lbu a7, 124(a0) +; RV32VB-NEXT: lbu a3, 82(a0) +; RV32VB-NEXT: lbu a4, 93(a0) +; RV32VB-NEXT: lbu a5, 124(a0) +; RV32VB-NEXT: lbu a6, 144(a0) +; RV32VB-NEXT: lbu a7, 154(a0) +; RV32VB-NEXT: slli t0, a1, 16 +; RV32VB-NEXT: slli t1, a1, 8 +; RV32VB-NEXT: or a1, a1, t1 +; RV32VB-NEXT: slli a2, a2, 24 +; RV32VB-NEXT: zext.b a5, a5 +; RV32VB-NEXT: slli a6, a6, 16 +; RV32VB-NEXT: slli a7, a7, 24 +; RV32VB-NEXT: or a2, a2, t0 +; RV32VB-NEXT: or a6, a7, a6 +; RV32VB-NEXT: or a5, a5, t1 +; RV32VB-NEXT: lbu a7, 0(a0) +; RV32VB-NEXT: lbu t0, 1(a0) +; RV32VB-NEXT: lbu t1, 44(a0) +; RV32VB-NEXT: lbu a0, 55(a0) +; RV32VB-NEXT: 
slli a1, a1, 16 +; RV32VB-NEXT: zext.b a7, a7 +; RV32VB-NEXT: zext.b a3, a3 +; RV32VB-NEXT: or a7, a7, a1 +; RV32VB-NEXT: or a1, a3, a1 +; RV32VB-NEXT: zext.b a3, t0 +; RV32VB-NEXT: zext.b t0, t1 +; RV32VB-NEXT: zext.b a0, a0 +; RV32VB-NEXT: zext.b a4, a4 +; RV32VB-NEXT: slli a3, a3, 8 +; RV32VB-NEXT: slli a0, a0, 8 ; RV32VB-NEXT: slli a4, a4, 8 -; RV32VB-NEXT: or a3, a3, a4 -; RV32VB-NEXT: lbu a4, 144(a0) -; RV32VB-NEXT: lbu a0, 154(a0) -; RV32VB-NEXT: slli a6, a6, 8 +; RV32VB-NEXT: or a0, t0, a0 ; RV32VB-NEXT: or a5, a5, a6 -; RV32VB-NEXT: slli a4, a4, 16 -; RV32VB-NEXT: slli a0, a0, 24 -; RV32VB-NEXT: or a0, a0, a4 -; RV32VB-NEXT: slli a2, a2, 24 -; RV32VB-NEXT: or a2, a3, a2 -; RV32VB-NEXT: or a0, a7, a0 +; RV32VB-NEXT: or a3, a7, a3 +; RV32VB-NEXT: or a0, a0, a2 ; RV32VB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32VB-NEXT: vmv.v.x v8, a1 -; RV32VB-NEXT: vslide1down.vx v8, v8, a2 -; RV32VB-NEXT: vslide1down.vx v8, v8, a5 +; RV32VB-NEXT: vmv.v.x v8, a3 ; RV32VB-NEXT: vslide1down.vx v8, v8, a0 +; RV32VB-NEXT: or a1, a1, a4 +; RV32VB-NEXT: vslide1down.vx v8, v8, a1 +; RV32VB-NEXT: vslide1down.vx v8, v8, a5 ; RV32VB-NEXT: ret ; ; RV32VB-PACK-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2838,13 +2856,15 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64V-ONLY-NEXT: vmv.v.x v9, a6 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a7 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 2 -; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a3 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t0 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a4 -; RV64V-ONLY-NEXT: vslidedown.vi v9, v9, 1 -; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 1 +; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 ; RV64V-ONLY-NEXT: vslide1down.vx v9, v9, t1 ; RV64V-ONLY-NEXT: vslide1down.vx v10, v8, a5 ; RV64V-ONLY-NEXT: vslide1down.vx v8, v9, a0 @@ -2853,35 +2873,51 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; ; RVA22U64-LABEL: buildvec_v16i8_loads_undef_scattered: ; RVA22U64: # %bb.0: -; RVA22U64-NEXT: lbu a1, 1(a0) -; RVA22U64-NEXT: lbu a2, 0(a0) -; RVA22U64-NEXT: lbu a3, 44(a0) -; RVA22U64-NEXT: lbu a4, 55(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a6, a2, a1 +; RVA22U64-NEXT: zext.b a3, a0 ; RVA22U64-NEXT: lbu a7, 75(a0) -; RVA22U64-NEXT: lbu a5, 82(a0) -; RVA22U64-NEXT: lbu a1, 93(a0) -; RVA22U64-NEXT: lbu a2, 124(a0) -; RVA22U64-NEXT: slli a3, a3, 32 -; RVA22U64-NEXT: slli a4, a4, 40 -; RVA22U64-NEXT: or a3, a3, a4 +; RVA22U64-NEXT: lbu t0, 82(a0) +; RVA22U64-NEXT: lbu a6, 93(a0) +; RVA22U64-NEXT: lbu t1, 124(a0) ; RVA22U64-NEXT: lbu a4, 144(a0) -; RVA22U64-NEXT: lbu a0, 154(a0) -; RVA22U64-NEXT: slli a1, a1, 8 -; RVA22U64-NEXT: or a1, a1, a5 -; RVA22U64-NEXT: slli a4, a4, 48 -; RVA22U64-NEXT: slli a0, a0, 56 -; RVA22U64-NEXT: or a0, a0, a4 +; RVA22U64-NEXT: lbu a2, 154(a0) +; RVA22U64-NEXT: slli a5, a3, 48 +; RVA22U64-NEXT: slli a1, a3, 8 +; RVA22U64-NEXT: or t2, a3, a1 ; RVA22U64-NEXT: slli a7, a7, 56 -; RVA22U64-NEXT: or a3, a7, a3 +; RVA22U64-NEXT: zext.b a3, t1 +; RVA22U64-NEXT: slli a4, a4, 48 +; RVA22U64-NEXT: slli a2, a2, 56 +; RVA22U64-NEXT: or t1, a7, a5 +; RVA22U64-NEXT: or a7, a2, a4 +; RVA22U64-NEXT: or a1, a1, a3 +; RVA22U64-NEXT: lbu a3, 0(a0) +; RVA22U64-NEXT: lbu a4, 
1(a0) +; RVA22U64-NEXT: lbu a2, 44(a0) +; RVA22U64-NEXT: lbu a0, 55(a0) +; RVA22U64-NEXT: slli t2, t2, 16 +; RVA22U64-NEXT: zext.b a3, a3 +; RVA22U64-NEXT: zext.b a5, t0 +; RVA22U64-NEXT: or a3, a3, t2 +; RVA22U64-NEXT: or a5, a5, t2 +; RVA22U64-NEXT: zext.b a2, a2 +; RVA22U64-NEXT: zext.b a0, a0 ; RVA22U64-NEXT: slli a2, a2, 32 +; RVA22U64-NEXT: slli a0, a0, 40 ; RVA22U64-NEXT: or a0, a0, a2 -; RVA22U64-NEXT: or a2, a6, a3 -; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: zext.b a2, a4 +; RVA22U64-NEXT: zext.b a4, a6 +; RVA22U64-NEXT: slli a2, a2, 8 +; RVA22U64-NEXT: slli a4, a4, 8 +; RVA22U64-NEXT: or a2, a2, a3 +; RVA22U64-NEXT: slli a1, a1, 32 +; RVA22U64-NEXT: or a4, a4, a5 +; RVA22U64-NEXT: or a0, t1, a0 +; RVA22U64-NEXT: or a1, a7, a1 +; RVA22U64-NEXT: or a0, a0, a2 +; RVA22U64-NEXT: or a1, a1, a4 ; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RVA22U64-NEXT: vmv.v.x v8, a2 -; RVA22U64-NEXT: vslide1down.vx v8, v8, a0 +; RVA22U64-NEXT: vmv.v.x v8, a0 +; RVA22U64-NEXT: vslide1down.vx v8, v8, a1 ; RVA22U64-NEXT: ret ; ; RVA22U64-PACK-LABEL: buildvec_v16i8_loads_undef_scattered: @@ -2934,13 +2970,15 @@ define <16 x i8> @buildvec_v16i8_loads_undef_scattered(ptr %p) { ; RV64ZVE32-NEXT: vmv.v.x v9, a6 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a7 -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 2 -; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 2 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a3 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t0 ; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a4 -; RV64ZVE32-NEXT: vslidedown.vi v9, v9, 1 -; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 1 +; RV64ZVE32-NEXT: vslide1down.vx v9, v9, a0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 ; RV64ZVE32-NEXT: vslide1down.vx v9, v9, t1 ; RV64ZVE32-NEXT: vslide1down.vx v10, v8, a5 ; RV64ZVE32-NEXT: vslide1down.vx v8, v9, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 0c30cbe4a42ef..45ceccaf8e83f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1343,23 +1343,24 @@ define void @mulhs_v2i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: vid.v v9 +; RV32-NEXT: lui a1, 349525 ; RV32-NEXT: addi a2, a1, 1365 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vmv.v.x v10, a2 ; RV32-NEXT: li a2, 63 -; RV32-NEXT: addi a1, a1, 1366 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vrsub.vi v11, v9, 0 +; RV32-NEXT: addi a1, a1, 1366 ; RV32-NEXT: vsetvli zero, zero, e32, mf2, tu, ma ; RV32-NEXT: vmv.s.x v10, a1 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vmulh.vv v10, v8, v10 -; RV32-NEXT: vmadd.vv v11, v8, v10 -; RV32-NEXT: vsrl.vx v8, v11, a2 -; RV32-NEXT: vsra.vv v9, v11, v9 -; RV32-NEXT: vadd.vv v8, v9, v8 +; RV32-NEXT: vmul.vv v11, v8, v11 +; RV32-NEXT: vmulh.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v11 +; RV32-NEXT: vsrl.vx v10, v8, a2 +; RV32-NEXT: vsra.vv v8, v8, v9 +; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret ; @@ -1367,24 +1368,25 @@ define void @mulhs_v2i64(ptr %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vid.v v9 ; RV64-NEXT: lui a1, 349525 ; 
RV64-NEXT: addi a1, a1, 1365 ; RV64-NEXT: slli a2, a1, 32 ; RV64-NEXT: add a1, a1, a2 ; RV64-NEXT: lui a2, %hi(.LCPI74_0) -; RV64-NEXT: vid.v v9 ; RV64-NEXT: ld a2, %lo(.LCPI74_0)(a2) ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: li a1, 63 ; RV64-NEXT: vrsub.vi v11, v9, 0 +; RV64-NEXT: vmul.vv v11, v8, v11 ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma ; RV64-NEXT: vmv.s.x v10, a2 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV64-NEXT: vmulh.vv v10, v8, v10 -; RV64-NEXT: vmadd.vv v11, v8, v10 -; RV64-NEXT: vsrl.vx v8, v11, a1 -; RV64-NEXT: vsra.vv v9, v11, v9 -; RV64-NEXT: vadd.vv v8, v9, v8 +; RV64-NEXT: vmulh.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v8, v11 +; RV64-NEXT: vsrl.vx v10, v8, a1 +; RV64-NEXT: vsra.vv v8, v8, v9 +; RV64-NEXT: vadd.vv v8, v8, v10 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret %a = load <2 x i64>, ptr %x @@ -3313,26 +3315,26 @@ define void @mulhu_v4i64(ptr %x) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: lui a1, 524288 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vmv.s.x v12, a1 ; RV32-NEXT: lui a1, %hi(.LCPI184_0) ; RV32-NEXT: addi a1, a1, %lo(.LCPI184_0) -; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vle32.v v10, (a1) -; RV32-NEXT: lui a1, 524288 -; RV32-NEXT: vmv.v.i v12, 0 -; RV32-NEXT: vmv.s.x v14, a1 +; RV32-NEXT: vle32.v v14, (a1) +; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma +; RV32-NEXT: vslideup.vi v10, v12, 5 ; RV32-NEXT: lui a1, %hi(.LCPI184_1) ; RV32-NEXT: addi a1, a1, %lo(.LCPI184_1) -; RV32-NEXT: vsetivli zero, 6, e32, m2, tu, ma -; RV32-NEXT: vslideup.vi v12, v14, 5 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vle8.v v14, (a1) +; RV32-NEXT: vle8.v v16, (a1) ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV32-NEXT: vmulhu.vv v10, v8, v10 -; RV32-NEXT: vsub.vv v8, v8, v10 -; RV32-NEXT: vmulhu.vv v8, v8, v12 -; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vmulhu.vv v12, v8, v14 +; RV32-NEXT: vsub.vv v8, v8, v12 +; RV32-NEXT: vmulhu.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vsext.vf4 v10, v14 +; RV32-NEXT: vsext.vf4 v10, v16 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vsrl.vv v8, v8, v10 ; RV32-NEXT: vse64.v v8, (a0) @@ -3349,17 +3351,17 @@ define void @mulhu_v4i64(ptr %x) { ; RV64-NEXT: vmv.v.i v12, 0 ; RV64-NEXT: slli a1, a1, 63 ; RV64-NEXT: vmv.s.x v14, a1 -; RV64-NEXT: lui a1, 12320 -; RV64-NEXT: addi a1, a1, 513 ; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma ; RV64-NEXT: vslideup.vi v12, v14, 2 -; RV64-NEXT: vmv.s.x v14, a1 +; RV64-NEXT: lui a1, 12320 +; RV64-NEXT: addi a1, a1, 513 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vmulhu.vv v10, v8, v10 ; RV64-NEXT: vsub.vv v8, v8, v10 ; RV64-NEXT: vmulhu.vv v8, v8, v12 +; RV64-NEXT: vmv.s.x v12, a1 ; RV64-NEXT: vadd.vv v8, v8, v10 -; RV64-NEXT: vsext.vf8 v10, v14 +; RV64-NEXT: vsext.vf8 v10, v12 ; RV64-NEXT: vsrl.vv v8, v8, v10 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret @@ -3483,14 +3485,15 @@ define void @mulhs_v4i64(ptr %x) { ; RV32-NEXT: vsext.vf4 v12, v14 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vmulh.vv v10, v8, v10 -; RV32-NEXT: vmadd.vv v12, v8, v10 +; RV32-NEXT: vmul.vv v8, v8, v12 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a1 +; RV32-NEXT: vmv.v.x v14, a1 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-NEXT: vsext.vf4 v8, v10 +; RV32-NEXT: vsext.vf4 v12, v14 ; RV32-NEXT: vsetivli 
zero, 4, e64, m2, ta, ma -; RV32-NEXT: vsrl.vx v10, v12, a2 -; RV32-NEXT: vsra.vv v8, v12, v8 +; RV32-NEXT: vadd.vv v8, v10, v8 +; RV32-NEXT: vsrl.vx v10, v8, a2 +; RV32-NEXT: vsra.vv v8, v8, v12 ; RV32-NEXT: vadd.vv v8, v8, v10 ; RV32-NEXT: vse64.v v8, (a0) ; RV32-NEXT: ret @@ -3499,13 +3502,16 @@ define void @mulhs_v4i64(ptr %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV64-NEXT: vle64.v v8, (a0) -; RV64-NEXT: lui a1, 349525 -; RV64-NEXT: lui a2, 1044496 -; RV64-NEXT: addi a1, a1, 1365 -; RV64-NEXT: addi a2, a2, -256 -; RV64-NEXT: vmv.s.x v14, a2 -; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: lui a1, 1044496 +; RV64-NEXT: lui a2, 349525 +; RV64-NEXT: addi a1, a1, -256 +; RV64-NEXT: vmv.s.x v14, a1 +; RV64-NEXT: lui a1, 4096 +; RV64-NEXT: addi a2, a2, 1365 +; RV64-NEXT: addi a1, a1, 256 +; RV64-NEXT: vmv.s.x v15, a1 +; RV64-NEXT: slli a1, a2, 32 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: lui a2, %hi(.LCPI188_0) ; RV64-NEXT: ld a2, %lo(.LCPI188_0)(a2) ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma @@ -3514,16 +3520,14 @@ define void @mulhs_v4i64(ptr %x) { ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: li a1, 63 ; RV64-NEXT: vmerge.vxm v10, v10, a2, v0 -; RV64-NEXT: lui a2, 4096 -; RV64-NEXT: addi a2, a2, 256 ; RV64-NEXT: vsext.vf8 v12, v14 -; RV64-NEXT: vmulh.vv v10, v8, v10 -; RV64-NEXT: vmadd.vv v12, v8, v10 -; RV64-NEXT: vmv.s.x v10, a2 -; RV64-NEXT: vsext.vf8 v8, v10 -; RV64-NEXT: vsrl.vx v10, v12, a1 -; RV64-NEXT: vsra.vv v8, v12, v8 -; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vmul.vv v12, v8, v12 +; RV64-NEXT: vmulh.vv v8, v8, v10 +; RV64-NEXT: vsext.vf8 v10, v15 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vsrl.vx v12, v8, a1 +; RV64-NEXT: vsra.vv v8, v8, v10 +; RV64-NEXT: vadd.vv v8, v8, v12 ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: ret %a = load <4 x i64>, ptr %x @@ -5653,8 +5657,9 @@ define void @madd_vv_v2i64(ptr %x, <2 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vmadd.vv v9, v8, v8 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vmul.vv v9, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x i64>, ptr %x %b = add <2 x i64> %a, @@ -5668,8 +5673,9 @@ define void @madd_vv_v2i64_2(ptr %x, <2 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vmadd.vv v9, v8, v8 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vmul.vv v9, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x i64>, ptr %x %b = add <2 x i64> %a, @@ -5683,8 +5689,9 @@ define void @msub_vv_v2i64(ptr %x, <2 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vnmsub.vv v9, v8, v8 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vmul.vv v9, v8, v9 +; CHECK-NEXT: vsub.vv v8, v8, v9 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x i64>, ptr %x %b = sub <2 x i64> , %a @@ -5698,8 +5705,9 @@ define void @msub_vv_v2i64_2(ptr %x, <2 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v9, (a0) -; CHECK-NEXT: vnmsub.vv v9, v8, v8 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vmul.vv v9, v8, v9 +; CHECK-NEXT: vsub.vv v8, v8, v9 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x i64>, ptr %x %b = sub <2 x i64> , %a diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-logic.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-logic.ll index 00f0346fae143..519129441720d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-logic.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-logic.ll @@ -75,7 +75,8 @@ define void @andnot_v8i1(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vmandn.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 ; CHECK-NEXT: vsm.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x i1>, ptr %x @@ -92,7 +93,8 @@ define void @ornot_v16i1(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vmorn.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v8, v9, v8 ; CHECK-NEXT: vsm.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i1>, ptr %x @@ -110,7 +112,8 @@ define void @xornot_v32i1(ptr %x, ptr %y) { ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vmxnor.mm v8, v8, v9 +; CHECK-NEXT: vmxor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a0) ; CHECK-NEXT: ret %a = load <32 x i1>, ptr %x @@ -127,7 +130,8 @@ define void @nand_v8i1(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vmnand.mm v8, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x i1>, ptr %x @@ -144,7 +148,8 @@ define void @nor_v16i1(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vmnor.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i1>, ptr %x @@ -162,7 +167,8 @@ define void @xnor_v32i1(ptr %x, ptr %y) { ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vlm.v v8, (a0) ; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vmxnor.mm v8, v8, v9 +; CHECK-NEXT: vmxor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vsm.v v8, (a0) ; CHECK-NEXT: ret %a = load <32 x i1>, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll index 533b8b6864ebc..0af3c8cda88d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -1661,19 +1661,19 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i16> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v8, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV32-NEXT: vluxei16.v v9, (a0), v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v10, v10 +; RV32-NEXT: vluxei16.v v9, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i16: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwaddu.vv v10, v8, v8 -; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64V-NEXT: vluxei16.v v9, (a0), v10, 
v0.t +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64V-NEXT: vzext.vf2 v10, v8 +; RV64V-NEXT: vadd.vv v8, v10, v10 +; RV64V-NEXT: vluxei16.v v9, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v9 ; RV64V-NEXT: ret ; @@ -1809,8 +1809,10 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i16> %passthru) { ; RV32-LABEL: mgather_baseidx_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV32-NEXT: vwadd.vv v10, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret @@ -2676,21 +2678,21 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i32> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v9, v8, a1 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vsll.vi v8, v9, 2 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei16.v v10, (a0), v9, v0.t +; RV32-NEXT: vluxei16.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v9, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v9, v8 +; RV64V-NEXT: vsll.vi v8, v9, 2 ; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64V-NEXT: vluxei16.v v10, (a0), v9, v0.t +; RV64V-NEXT: vluxei16.v v10, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret ; @@ -2826,11 +2828,10 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) { ; RV32-LABEL: mgather_baseidx_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vsext.vf2 v12, v8 +; RV32-NEXT: vsll.vi v8, v12, 2 +; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; @@ -2968,11 +2969,10 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i32> %passthru) { ; RV32-LABEL: mgather_baseidx_sext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vsext.vf2 v12, v8 +; RV32-NEXT: vsll.vi v8, v12, 2 +; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; @@ -3111,21 +3111,19 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, < define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> 
%m, <8 x i32> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vzext.vf2 v12, v8 +; RV32-NEXT: vsll.vi v8, v12, 2 +; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8i32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v12, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64V-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64V-NEXT: vzext.vf2 v12, v8 +; RV64V-NEXT: vsll.vi v8, v12, 2 +; RV64V-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret ; @@ -3270,9 +3268,10 @@ define <8 x i32> @mgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m, ; ; RV64V-LABEL: mgather_baseidx_v8i32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64V-NEXT: vwmulsu.vx v12, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t ; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret @@ -4555,21 +4554,21 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x i64> %passthru) { ; RV32V-LABEL: mgather_baseidx_zext_v8i8_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32V-NEXT: vwmulu.vx v9, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32V-NEXT: vzext.vf2 v9, v8 +; RV32V-NEXT: vsll.vi v8, v9, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei16.v v12, (a0), v9, v0.t +; RV32V-NEXT: vluxei16.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v9, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v9, v8 +; RV64V-NEXT: vsll.vi v8, v9, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei16.v v12, (a0), v9, v0.t +; RV64V-NEXT: vluxei16.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; @@ -4844,11 +4843,11 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) { ; RV32V-LABEL: mgather_baseidx_v8i16_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v10, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; @@ -4864,17 +4863,16 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-LABEL: mgather_baseidx_v8i16_v8i64: ; 
RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: andi a3, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB51_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a3, v10 +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) ; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 @@ -4910,40 +4908,40 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: beqz a4, .LBB51_2 ; RV32ZVE32F-NEXT: .LBB51_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 ; RV32ZVE32F-NEXT: lw a4, 0(a5) ; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB51_3 ; RV32ZVE32F-NEXT: .LBB51_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 ; RV32ZVE32F-NEXT: lw a6, 0(a7) ; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB51_4 ; RV32ZVE32F-NEXT: .LBB51_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s t2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 0(t2) ; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB51_5 ; RV32ZVE32F-NEXT: .LBB51_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s t4, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 0(t4) ; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB51_6 ; RV32ZVE32F-NEXT: .LBB51_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s t6, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 ; RV32ZVE32F-NEXT: lw t5, 0(t6) ; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB51_13: # %else14 @@ -4957,8 +4955,8 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: beqz s0, .LBB51_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s s1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 0(s1) ; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 @@ -4974,7 +4972,7 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i ; RV32ZVE32F-NEXT: beqz t0, 
.LBB51_15 ; RV32ZVE32F-NEXT: .LBB51_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw t0, 0(a2) ; RV32ZVE32F-NEXT: lw a2, 4(a2) @@ -5125,11 +5123,11 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) { ; RV32V-LABEL: mgather_baseidx_sext_v8i16_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v10, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; @@ -5145,17 +5143,16 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8i64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: andi a3, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB52_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a3, v10 +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) ; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 @@ -5191,40 +5188,40 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: beqz a4, .LBB52_2 ; RV32ZVE32F-NEXT: .LBB52_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 ; RV32ZVE32F-NEXT: lw a4, 0(a5) ; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB52_3 ; RV32ZVE32F-NEXT: .LBB52_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 ; RV32ZVE32F-NEXT: lw a6, 0(a7) ; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB52_4 ; RV32ZVE32F-NEXT: .LBB52_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s t2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 0(t2) ; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB52_5 ; RV32ZVE32F-NEXT: .LBB52_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s t4, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; 
RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 0(t4) ; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB52_6 ; RV32ZVE32F-NEXT: .LBB52_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s t6, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 ; RV32ZVE32F-NEXT: lw t5, 0(t6) ; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB52_13: # %else14 @@ -5238,8 +5235,8 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: beqz s0, .LBB52_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s s1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 0(s1) ; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 @@ -5255,7 +5252,7 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: beqz t0, .LBB52_15 ; RV32ZVE32F-NEXT: .LBB52_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw t0, 0(a2) ; RV32ZVE32F-NEXT: lw a2, 4(a2) @@ -5407,38 +5404,37 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x i64> %passthru) { ; RV32V-LABEL: mgather_baseidx_zext_v8i16_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulu.vx v10, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vzext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v10, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-NEXT: vzext.vf2 v10, v8 +; RV64V-NEXT: vsll.vi v8, v10, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV64V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8i64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: andi a3, t0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: beqz a3, .LBB53_7 ; RV32ZVE32F-NEXT: # %bb.1: # %cond.load -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a3, v10 +; RV32ZVE32F-NEXT: vmv.x.s a3, v8 ; RV32ZVE32F-NEXT: lw a1, 0(a3) ; RV32ZVE32F-NEXT: lw a3, 4(a3) ; RV32ZVE32F-NEXT: andi a4, t0, 2 @@ -5474,40 
+5470,40 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: beqz a4, .LBB53_2 ; RV32ZVE32F-NEXT: .LBB53_8: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a5, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a5, v10 ; RV32ZVE32F-NEXT: lw a4, 0(a5) ; RV32ZVE32F-NEXT: lw a5, 4(a5) ; RV32ZVE32F-NEXT: andi a6, t0, 4 ; RV32ZVE32F-NEXT: beqz a6, .LBB53_3 ; RV32ZVE32F-NEXT: .LBB53_9: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a7, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a7, v10 ; RV32ZVE32F-NEXT: lw a6, 0(a7) ; RV32ZVE32F-NEXT: lw a7, 4(a7) ; RV32ZVE32F-NEXT: andi t1, t0, 8 ; RV32ZVE32F-NEXT: beqz t1, .LBB53_4 ; RV32ZVE32F-NEXT: .LBB53_10: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s t2, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s t2, v10 ; RV32ZVE32F-NEXT: lw t1, 0(t2) ; RV32ZVE32F-NEXT: lw t2, 4(t2) ; RV32ZVE32F-NEXT: andi t3, t0, 16 ; RV32ZVE32F-NEXT: beqz t3, .LBB53_5 ; RV32ZVE32F-NEXT: .LBB53_11: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s t4, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s t4, v10 ; RV32ZVE32F-NEXT: lw t3, 0(t4) ; RV32ZVE32F-NEXT: lw t4, 4(t4) ; RV32ZVE32F-NEXT: andi t5, t0, 32 ; RV32ZVE32F-NEXT: beqz t5, .LBB53_6 ; RV32ZVE32F-NEXT: .LBB53_12: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s t6, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s t6, v10 ; RV32ZVE32F-NEXT: lw t5, 0(t6) ; RV32ZVE32F-NEXT: lw t6, 4(t6) ; RV32ZVE32F-NEXT: .LBB53_13: # %else14 @@ -5521,8 +5517,8 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: beqz s0, .LBB53_16 ; RV32ZVE32F-NEXT: # %bb.14: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s s1, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s s1, v10 ; RV32ZVE32F-NEXT: lw s0, 0(s1) ; RV32ZVE32F-NEXT: lw s1, 4(s1) ; RV32ZVE32F-NEXT: andi t0, t0, -128 @@ -5538,7 +5534,7 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, < ; RV32ZVE32F-NEXT: beqz t0, .LBB53_15 ; RV32ZVE32F-NEXT: .LBB53_17: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a2, v8 ; RV32ZVE32F-NEXT: lw t0, 0(a2) ; RV32ZVE32F-NEXT: lw a2, 4(a2) @@ -5707,11 +5703,10 @@ define <8 x i64> @mgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x i ; ; RV64V-LABEL: mgather_baseidx_v8i32_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v 
v8, v12 ; RV64V-NEXT: ret ; @@ -5988,11 +5983,10 @@ define <8 x i64> @mgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; ; RV64V-LABEL: mgather_baseidx_sext_v8i32_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; @@ -6270,11 +6264,10 @@ define <8 x i64> @mgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, < ; ; RV64V-LABEL: mgather_baseidx_zext_v8i32_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulu.vx v16, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; @@ -7442,19 +7435,19 @@ define <8 x bfloat> @mgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x bfloat> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8bf16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v8, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV32-NEXT: vluxei16.v v9, (a0), v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v10, v10 +; RV32-NEXT: vluxei16.v v9, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8bf16: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwaddu.vv v10, v8, v8 -; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64V-NEXT: vluxei16.v v9, (a0), v10, v0.t +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64V-NEXT: vzext.vf2 v10, v8 +; RV64V-NEXT: vadd.vv v8, v10, v10 +; RV64V-NEXT: vluxei16.v v9, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v9 ; RV64V-NEXT: ret ; @@ -7590,8 +7583,10 @@ define <8 x bfloat> @mgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, define <8 x bfloat> @mgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x bfloat> %passthru) { ; RV32-LABEL: mgather_baseidx_v8bf16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV32-NEXT: vwadd.vv v10, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret @@ -8748,19 +8743,19 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x half> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v8, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV32-NEXT: vluxei16.v v9, (a0), v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v10, v10 +; RV32-NEXT: 
vluxei16.v v9, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f16: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwaddu.vv v10, v8, v8 -; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, mu -; RV64V-NEXT: vluxei16.v v9, (a0), v10, v0.t +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; RV64V-NEXT: vzext.vf2 v10, v8 +; RV64V-NEXT: vadd.vv v8, v10, v10 +; RV64V-NEXT: vluxei16.v v9, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v9 ; RV64V-NEXT: ret ; @@ -9020,8 +9015,10 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x half> %passthru) { ; RV32-LABEL: mgather_baseidx_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; RV32-NEXT: vwadd.vv v10, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; RV32-NEXT: vluxei32.v v9, (a0), v10, v0.t ; RV32-NEXT: vmv.v.v v8, v9 ; RV32-NEXT: ret @@ -9868,21 +9865,21 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x float> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i8_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v9, v8, a1 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vsll.vi v8, v9, 2 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei16.v v10, (a0), v9, v0.t +; RV32-NEXT: vluxei16.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v9, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v9, v8 +; RV64V-NEXT: vsll.vi v8, v9, 2 ; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64V-NEXT: vluxei16.v v10, (a0), v9, v0.t +; RV64V-NEXT: vluxei16.v v10, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret ; @@ -10018,11 +10015,10 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, < define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) { ; RV32-LABEL: mgather_baseidx_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vsext.vf2 v12, v8 +; RV32-NEXT: vsll.vi v8, v12, 2 +; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; @@ -10160,11 +10156,10 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) { ; RV32-LABEL: mgather_baseidx_sext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; 
RV32-NEXT: vsext.vf2 v12, v8 +; RV32-NEXT: vsll.vi v8, v12, 2 +; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; @@ -10303,21 +10298,19 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x float> %passthru) { ; RV32-LABEL: mgather_baseidx_zext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v8, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV32-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV32-NEXT: vzext.vf2 v12, v8 +; RV32-NEXT: vsll.vi v8, v12, 2 +; RV32-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8f32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v12, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu -; RV64V-NEXT: vluxei32.v v10, (a0), v12, v0.t +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; RV64V-NEXT: vzext.vf2 v12, v8 +; RV64V-NEXT: vsll.vi v8, v12, 2 +; RV64V-NEXT: vluxei32.v v10, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret ; @@ -10462,9 +10455,10 @@ define <8 x float> @mgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> % ; ; RV64V-LABEL: mgather_baseidx_v8f32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; RV64V-NEXT: vwmulsu.vx v12, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v12, v8 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, mu ; RV64V-NEXT: vluxei64.v v10, (a0), v12, v0.t ; RV64V-NEXT: vmv.v.v v8, v10 ; RV64V-NEXT: ret @@ -11497,21 +11491,21 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8 x double> %passthru) { ; RV32V-LABEL: mgather_baseidx_zext_v8i8_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32V-NEXT: vwmulu.vx v9, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32V-NEXT: vzext.vf2 v9, v8 +; RV32V-NEXT: vsll.vi v8, v9, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei16.v v12, (a0), v9, v0.t +; RV32V-NEXT: vluxei16.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i8_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v9, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v9, v8 +; RV64V-NEXT: vsll.vi v8, v9, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei16.v v12, (a0), v9, v0.t +; RV64V-NEXT: vluxei16.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; @@ -11717,11 +11711,11 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) { ; RV32V-LABEL: mgather_baseidx_v8i16_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v10, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v10, v8 +; 
RV32V-NEXT: vsll.vi v8, v10, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; @@ -11737,38 +11731,38 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV32ZVE32F-LABEL: mgather_baseidx_v8i16_v8f64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li a2, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a3, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, a2, v8 +; RV32ZVE32F-NEXT: vmv.x.s a2, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB100_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a2, a1, 2 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_11 +; RV32ZVE32F-NEXT: andi a1, a2, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB100_11 ; RV32ZVE32F-NEXT: .LBB100_2: # %else2 -; RV32ZVE32F-NEXT: andi a2, a1, 4 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_12 +; RV32ZVE32F-NEXT: andi a1, a2, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB100_12 ; RV32ZVE32F-NEXT: .LBB100_3: # %else5 -; RV32ZVE32F-NEXT: andi a2, a1, 8 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_13 +; RV32ZVE32F-NEXT: andi a1, a2, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB100_13 ; RV32ZVE32F-NEXT: .LBB100_4: # %else8 -; RV32ZVE32F-NEXT: andi a2, a1, 16 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_14 +; RV32ZVE32F-NEXT: andi a1, a2, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB100_14 ; RV32ZVE32F-NEXT: .LBB100_5: # %else11 -; RV32ZVE32F-NEXT: andi a2, a1, 32 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_15 +; RV32ZVE32F-NEXT: andi a1, a2, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB100_15 ; RV32ZVE32F-NEXT: .LBB100_6: # %else14 -; RV32ZVE32F-NEXT: andi a2, a1, 64 -; RV32ZVE32F-NEXT: bnez a2, .LBB100_16 +; RV32ZVE32F-NEXT: andi a1, a2, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB100_16 ; RV32ZVE32F-NEXT: .LBB100_7: # %else17 -; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: andi a1, a2, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB100_9 ; RV32ZVE32F-NEXT: .LBB100_8: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB100_9: # %else20 @@ -11782,52 +11776,51 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB100_10: # %cond.load -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a2, v10 -; RV32ZVE32F-NEXT: fld fa0, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 2 -; RV32ZVE32F-NEXT: beqz a2, .LBB100_2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB100_2 ; RV32ZVE32F-NEXT: .LBB100_11: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa1, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 4 -; RV32ZVE32F-NEXT: beqz a2, .LBB100_3 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: 
fld fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB100_3 ; RV32ZVE32F-NEXT: .LBB100_12: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa2, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 8 -; RV32ZVE32F-NEXT: beqz a2, .LBB100_4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB100_4 ; RV32ZVE32F-NEXT: .LBB100_13: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa3, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 16 -; RV32ZVE32F-NEXT: beqz a2, .LBB100_5 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB100_5 ; RV32ZVE32F-NEXT: .LBB100_14: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa4, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 32 -; RV32ZVE32F-NEXT: beqz a2, .LBB100_6 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB100_6 ; RV32ZVE32F-NEXT: .LBB100_15: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa5, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 64 -; RV32ZVE32F-NEXT: beqz a2, .LBB100_7 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB100_7 ; RV32ZVE32F-NEXT: .LBB100_16: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa6, 0(a2) -; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, -128 ; RV32ZVE32F-NEXT: bnez a1, .LBB100_8 ; RV32ZVE32F-NEXT: j .LBB100_9 ; @@ -11929,11 +11922,11 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) { ; RV32V-LABEL: mgather_baseidx_sext_v8i16_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v10, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; @@ -11949,38 +11942,38 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F-LABEL: mgather_baseidx_sext_v8i16_v8f64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li a2, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: 
vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a3, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, a2, v8 +; RV32ZVE32F-NEXT: vmv.x.s a2, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB101_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a2, a1, 2 -; RV32ZVE32F-NEXT: bnez a2, .LBB101_11 +; RV32ZVE32F-NEXT: andi a1, a2, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB101_11 ; RV32ZVE32F-NEXT: .LBB101_2: # %else2 -; RV32ZVE32F-NEXT: andi a2, a1, 4 -; RV32ZVE32F-NEXT: bnez a2, .LBB101_12 +; RV32ZVE32F-NEXT: andi a1, a2, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB101_12 ; RV32ZVE32F-NEXT: .LBB101_3: # %else5 -; RV32ZVE32F-NEXT: andi a2, a1, 8 -; RV32ZVE32F-NEXT: bnez a2, .LBB101_13 +; RV32ZVE32F-NEXT: andi a1, a2, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB101_13 ; RV32ZVE32F-NEXT: .LBB101_4: # %else8 -; RV32ZVE32F-NEXT: andi a2, a1, 16 -; RV32ZVE32F-NEXT: bnez a2, .LBB101_14 +; RV32ZVE32F-NEXT: andi a1, a2, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB101_14 ; RV32ZVE32F-NEXT: .LBB101_5: # %else11 -; RV32ZVE32F-NEXT: andi a2, a1, 32 -; RV32ZVE32F-NEXT: bnez a2, .LBB101_15 +; RV32ZVE32F-NEXT: andi a1, a2, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB101_15 ; RV32ZVE32F-NEXT: .LBB101_6: # %else14 -; RV32ZVE32F-NEXT: andi a2, a1, 64 -; RV32ZVE32F-NEXT: bnez a2, .LBB101_16 +; RV32ZVE32F-NEXT: andi a1, a2, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB101_16 ; RV32ZVE32F-NEXT: .LBB101_7: # %else17 -; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: andi a1, a2, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB101_9 ; RV32ZVE32F-NEXT: .LBB101_8: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB101_9: # %else20 @@ -11994,52 +11987,51 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB101_10: # %cond.load -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a2, v10 -; RV32ZVE32F-NEXT: fld fa0, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 2 -; RV32ZVE32F-NEXT: beqz a2, .LBB101_2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB101_2 ; RV32ZVE32F-NEXT: .LBB101_11: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa1, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 4 -; RV32ZVE32F-NEXT: beqz a2, .LBB101_3 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB101_3 ; RV32ZVE32F-NEXT: .LBB101_12: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa2, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 8 -; RV32ZVE32F-NEXT: beqz a2, .LBB101_4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB101_4 ; 
RV32ZVE32F-NEXT: .LBB101_13: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa3, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 16 -; RV32ZVE32F-NEXT: beqz a2, .LBB101_5 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB101_5 ; RV32ZVE32F-NEXT: .LBB101_14: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa4, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 32 -; RV32ZVE32F-NEXT: beqz a2, .LBB101_6 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB101_6 ; RV32ZVE32F-NEXT: .LBB101_15: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa5, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 64 -; RV32ZVE32F-NEXT: beqz a2, .LBB101_7 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB101_7 ; RV32ZVE32F-NEXT: .LBB101_16: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa6, 0(a2) -; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, -128 ; RV32ZVE32F-NEXT: bnez a1, .LBB101_8 ; RV32ZVE32F-NEXT: j .LBB101_9 ; @@ -12142,59 +12134,59 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, <8 x double> %passthru) { ; RV32V-LABEL: mgather_baseidx_zext_v8i16_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulu.vx v10, v8, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vzext.vf2 v10, v8 +; RV32V-NEXT: vsll.vi v8, v10, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV32V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV32V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV32V-NEXT: vmv.v.v v8, v12 ; RV32V-NEXT: ret ; ; RV64V-LABEL: mgather_baseidx_zext_v8i16_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v10, v8, a1 +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-NEXT: vzext.vf2 v10, v8 +; RV64V-NEXT: vsll.vi v8, v10, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei32.v v12, (a0), v10, v0.t +; RV64V-NEXT: vluxei32.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mgather_baseidx_zext_v8i16_v8f64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li a2, 8 +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi a3, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: 
vwmaccu.vx v10, a2, v8 +; RV32ZVE32F-NEXT: vmv.x.s a2, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a3, a2, 1 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez a3, .LBB102_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a2, a1, 2 -; RV32ZVE32F-NEXT: bnez a2, .LBB102_11 +; RV32ZVE32F-NEXT: andi a1, a2, 2 +; RV32ZVE32F-NEXT: bnez a1, .LBB102_11 ; RV32ZVE32F-NEXT: .LBB102_2: # %else2 -; RV32ZVE32F-NEXT: andi a2, a1, 4 -; RV32ZVE32F-NEXT: bnez a2, .LBB102_12 +; RV32ZVE32F-NEXT: andi a1, a2, 4 +; RV32ZVE32F-NEXT: bnez a1, .LBB102_12 ; RV32ZVE32F-NEXT: .LBB102_3: # %else5 -; RV32ZVE32F-NEXT: andi a2, a1, 8 -; RV32ZVE32F-NEXT: bnez a2, .LBB102_13 +; RV32ZVE32F-NEXT: andi a1, a2, 8 +; RV32ZVE32F-NEXT: bnez a1, .LBB102_13 ; RV32ZVE32F-NEXT: .LBB102_4: # %else8 -; RV32ZVE32F-NEXT: andi a2, a1, 16 -; RV32ZVE32F-NEXT: bnez a2, .LBB102_14 +; RV32ZVE32F-NEXT: andi a1, a2, 16 +; RV32ZVE32F-NEXT: bnez a1, .LBB102_14 ; RV32ZVE32F-NEXT: .LBB102_5: # %else11 -; RV32ZVE32F-NEXT: andi a2, a1, 32 -; RV32ZVE32F-NEXT: bnez a2, .LBB102_15 +; RV32ZVE32F-NEXT: andi a1, a2, 32 +; RV32ZVE32F-NEXT: bnez a1, .LBB102_15 ; RV32ZVE32F-NEXT: .LBB102_6: # %else14 -; RV32ZVE32F-NEXT: andi a2, a1, 64 -; RV32ZVE32F-NEXT: bnez a2, .LBB102_16 +; RV32ZVE32F-NEXT: andi a1, a2, 64 +; RV32ZVE32F-NEXT: bnez a1, .LBB102_16 ; RV32ZVE32F-NEXT: .LBB102_7: # %else17 -; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: andi a1, a2, -128 ; RV32ZVE32F-NEXT: beqz a1, .LBB102_9 ; RV32ZVE32F-NEXT: .LBB102_8: # %cond.load19 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a1, v8 ; RV32ZVE32F-NEXT: fld fa7, 0(a1) ; RV32ZVE32F-NEXT: .LBB102_9: # %else20 @@ -12208,52 +12200,51 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs ; RV32ZVE32F-NEXT: fsd fa7, 56(a0) ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB102_10: # %cond.load -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a2, v10 -; RV32ZVE32F-NEXT: fld fa0, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 2 -; RV32ZVE32F-NEXT: beqz a2, .LBB102_2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v8 +; RV32ZVE32F-NEXT: fld fa0, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 2 +; RV32ZVE32F-NEXT: beqz a1, .LBB102_2 ; RV32ZVE32F-NEXT: .LBB102_11: # %cond.load1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa1, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 4 -; RV32ZVE32F-NEXT: beqz a2, .LBB102_3 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa1, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 4 +; RV32ZVE32F-NEXT: beqz a1, .LBB102_3 ; RV32ZVE32F-NEXT: .LBB102_12: # %cond.load4 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa2, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 8 -; RV32ZVE32F-NEXT: beqz a2, .LBB102_4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa2, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 8 +; RV32ZVE32F-NEXT: beqz a1, .LBB102_4 ; RV32ZVE32F-NEXT: .LBB102_13: # %cond.load7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; 
RV32ZVE32F-NEXT: fld fa3, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 16 -; RV32ZVE32F-NEXT: beqz a2, .LBB102_5 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa3, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 16 +; RV32ZVE32F-NEXT: beqz a1, .LBB102_5 ; RV32ZVE32F-NEXT: .LBB102_14: # %cond.load10 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa4, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 32 -; RV32ZVE32F-NEXT: beqz a2, .LBB102_6 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa4, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 32 +; RV32ZVE32F-NEXT: beqz a1, .LBB102_6 ; RV32ZVE32F-NEXT: .LBB102_15: # %cond.load13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa5, 0(a2) -; RV32ZVE32F-NEXT: andi a2, a1, 64 -; RV32ZVE32F-NEXT: beqz a2, .LBB102_7 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa5, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, 64 +; RV32ZVE32F-NEXT: beqz a1, .LBB102_7 ; RV32ZVE32F-NEXT: .LBB102_16: # %cond.load16 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a2, v8 -; RV32ZVE32F-NEXT: fld fa6, 0(a2) -; RV32ZVE32F-NEXT: andi a1, a1, -128 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a1, v10 +; RV32ZVE32F-NEXT: fld fa6, 0(a1) +; RV32ZVE32F-NEXT: andi a1, a2, -128 ; RV32ZVE32F-NEXT: bnez a1, .LBB102_8 ; RV32ZVE32F-NEXT: j .LBB102_9 ; @@ -12373,11 +12364,10 @@ define <8 x double> @mgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; ; RV64V-LABEL: mgather_baseidx_v8i32_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; @@ -12590,11 +12580,10 @@ define <8 x double> @mgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; ; RV64V-LABEL: mgather_baseidx_sext_v8i32_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vsext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; @@ -12808,11 +12797,10 @@ define <8 x double> @mgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idxs ; ; RV64V-LABEL: mgather_baseidx_zext_v8i32_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulu.vx v16, v8, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; RV64V-NEXT: vluxei64.v v12, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64V-NEXT: vzext.vf2 v16, v8 +; RV64V-NEXT: vsll.vi v8, v16, 3 +; RV64V-NEXT: vluxei64.v v12, (a0), v8, v0.t ; RV64V-NEXT: vmv.v.v v8, v12 ; RV64V-NEXT: ret ; diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll index e86fae6d501e5..c6addc8bc1591 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1238,18 +1238,18 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v9, v9 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v9, v10, v10 +; RV32-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8i16: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwaddu.vv v10, v9, v9 -; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v10, v9 +; RV64V-NEXT: vadd.vv v9, v10, v10 +; RV64V-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i16: @@ -1371,8 +1371,10 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8 define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; @@ -2097,20 +2099,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v11, v10, a1 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v11, v10 +; RV32-NEXT: vsll.vi v10, v11, 2 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8i32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v11, v10, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v11, v10 +; RV64V-NEXT: vsll.vi v10, v11, 2 ; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i32: @@ -2236,11 +2238,10 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8 define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a1 -; RV32-NEXT: vsetvli zero, 
zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_v8i16_v8i32: @@ -2367,11 +2368,10 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> % define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8i32: @@ -2499,20 +2499,18 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v10, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8i32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v12, v10, a1 -; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-NEXT: vzext.vf2 v12, v10 +; RV64V-NEXT: vsll.vi v10, v12, 2 +; RV64V-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i32: @@ -2646,9 +2644,10 @@ define void @mscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs, ; ; RV64V-LABEL: mscatter_baseidx_v8i32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v12, v10, a1 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v12, v10 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; @@ -3789,20 +3788,20 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_zext_v8i8_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32V-NEXT: vwmulu.vx v13, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32V-NEXT: vzext.vf2 v13, v12 +; RV32V-NEXT: vsll.vi v12, v13, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV32V-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v13, v12, a1 +; RV64V-NEXT: vsetivli 
zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v13, v12 +; RV64V-NEXT: vsll.vi v12, v13, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV64V-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8i64: @@ -4044,11 +4043,11 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8 define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_v8i16_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v14, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_v8i16_v8i64: @@ -4076,47 +4075,47 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV32ZVE32F-NEXT: lw a7, 44(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw t2, 24(a0) -; RV32ZVE32F-NEXT: lw t3, 28(a0) -; RV32ZVE32F-NEXT: lw t0, 32(a0) -; RV32ZVE32F-NEXT: lw t1, 36(a0) -; RV32ZVE32F-NEXT: lw t6, 8(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) -; RV32ZVE32F-NEXT: lw t4, 16(a0) -; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 28(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 36(a0) +; RV32ZVE32F-NEXT: lw s0, 8(a0) +; RV32ZVE32F-NEXT: lw s1, 12(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw t6, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li s1, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, s1, v8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi s2, t0, 1 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez s2, .LBB45_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_11 ; RV32ZVE32F-NEXT: .LBB45_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: andi a0, t0, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_12 ; RV32ZVE32F-NEXT: .LBB45_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: andi a0, t0, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_13 ; RV32ZVE32F-NEXT: .LBB45_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: andi a0, t0, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_14 ; RV32ZVE32F-NEXT: .LBB45_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: andi a0, t0, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_15 ; RV32ZVE32F-NEXT: .LBB45_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: andi a0, t0, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_16 ; RV32ZVE32F-NEXT: .LBB45_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: andi a0, t0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_9 ; RV32ZVE32F-NEXT: .LBB45_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, 
v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) @@ -4132,61 +4131,60 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB45_10: # %cond.store ; RV32ZVE32F-NEXT: .cfi_restore_state -; RV32ZVE32F-NEXT: lw s1, 0(a0) +; RV32ZVE32F-NEXT: lw a1, 0(a0) ; RV32ZVE32F-NEXT: lw a0, 4(a0) -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s s2, v10 -; RV32ZVE32F-NEXT: sw s1, 0(s2) +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw a1, 0(s2) ; RV32ZVE32F-NEXT: sw a0, 4(s2) -; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_2 ; RV32ZVE32F-NEXT: .LBB45_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t6, 0(a0) -; RV32ZVE32F-NEXT: sw s0, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 0(a0) +; RV32ZVE32F-NEXT: sw s1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_3 ; RV32ZVE32F-NEXT: .LBB45_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t4, 0(a0) -; RV32ZVE32F-NEXT: sw t5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_4 ; RV32ZVE32F-NEXT: .LBB45_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t2, 0(a0) -; RV32ZVE32F-NEXT: sw t3, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_5 ; RV32ZVE32F-NEXT: .LBB45_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t0, 0(a0) -; RV32ZVE32F-NEXT: sw t1, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_6 ; RV32ZVE32F-NEXT: .LBB45_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a6, 0(a0) ; RV32ZVE32F-NEXT: sw a7, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: andi a0, t0, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB45_7 ; RV32ZVE32F-NEXT: .LBB45_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a5, 4(a0) -; 
RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: andi a0, t0, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB45_8 ; RV32ZVE32F-NEXT: j .LBB45_9 ; @@ -4291,11 +4289,11 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> % define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_sext_v8i16_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v14, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8i64: @@ -4323,47 +4321,47 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: lw a7, 44(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw t2, 24(a0) -; RV32ZVE32F-NEXT: lw t3, 28(a0) -; RV32ZVE32F-NEXT: lw t0, 32(a0) -; RV32ZVE32F-NEXT: lw t1, 36(a0) -; RV32ZVE32F-NEXT: lw t6, 8(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) -; RV32ZVE32F-NEXT: lw t4, 16(a0) -; RV32ZVE32F-NEXT: lw t5, 20(a0) +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 28(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 36(a0) +; RV32ZVE32F-NEXT: lw s0, 8(a0) +; RV32ZVE32F-NEXT: lw s1, 12(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw t6, 20(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li s1, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, s1, v8 +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi s2, t0, 1 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 ; RV32ZVE32F-NEXT: bnez s2, .LBB46_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_11 ; RV32ZVE32F-NEXT: .LBB46_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: andi a0, t0, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_12 ; RV32ZVE32F-NEXT: .LBB46_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: andi a0, t0, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_13 ; RV32ZVE32F-NEXT: .LBB46_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: andi a0, t0, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_14 ; RV32ZVE32F-NEXT: .LBB46_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: andi a0, t0, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_15 ; RV32ZVE32F-NEXT: .LBB46_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: andi a0, t0, 64 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_16 ; RV32ZVE32F-NEXT: .LBB46_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: andi a0, t0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_9 ; RV32ZVE32F-NEXT: .LBB46_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) @@ 
-4379,61 +4377,60 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB46_10: # %cond.store ; RV32ZVE32F-NEXT: .cfi_restore_state -; RV32ZVE32F-NEXT: lw s1, 0(a0) +; RV32ZVE32F-NEXT: lw a1, 0(a0) ; RV32ZVE32F-NEXT: lw a0, 4(a0) -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s s2, v10 -; RV32ZVE32F-NEXT: sw s1, 0(s2) +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw a1, 0(s2) ; RV32ZVE32F-NEXT: sw a0, 4(s2) -; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_2 ; RV32ZVE32F-NEXT: .LBB46_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t6, 0(a0) -; RV32ZVE32F-NEXT: sw s0, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 0(a0) +; RV32ZVE32F-NEXT: sw s1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_3 ; RV32ZVE32F-NEXT: .LBB46_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t4, 0(a0) -; RV32ZVE32F-NEXT: sw t5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_4 ; RV32ZVE32F-NEXT: .LBB46_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t2, 0(a0) -; RV32ZVE32F-NEXT: sw t3, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_5 ; RV32ZVE32F-NEXT: .LBB46_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t0, 0(a0) -; RV32ZVE32F-NEXT: sw t1, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_6 ; RV32ZVE32F-NEXT: .LBB46_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a6, 0(a0) ; RV32ZVE32F-NEXT: sw a7, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: andi a0, t0, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB46_7 ; RV32ZVE32F-NEXT: .LBB46_16: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: andi a0, t0, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB46_8 ; RV32ZVE32F-NEXT: j .LBB46_9 ; @@ -4539,20 +4536,20 
@@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_zext_v8i16_v8i64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulu.vx v14, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vzext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v14, v12, a1 +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-NEXT: vzext.vf2 v14, v12 +; RV64V-NEXT: vsll.vi v12, v14, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8i64: @@ -4572,47 +4569,47 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: lw a7, 44(a0) ; RV32ZVE32F-NEXT: lw a4, 48(a0) ; RV32ZVE32F-NEXT: lw a5, 52(a0) -; RV32ZVE32F-NEXT: lw t2, 24(a0) -; RV32ZVE32F-NEXT: lw t3, 28(a0) -; RV32ZVE32F-NEXT: lw t0, 32(a0) -; RV32ZVE32F-NEXT: lw t1, 36(a0) -; RV32ZVE32F-NEXT: lw t6, 8(a0) -; RV32ZVE32F-NEXT: lw s0, 12(a0) -; RV32ZVE32F-NEXT: lw t4, 16(a0) -; RV32ZVE32F-NEXT: lw t5, 20(a0) -; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a1 -; RV32ZVE32F-NEXT: li s1, 8 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v0 -; RV32ZVE32F-NEXT: andi s2, a1, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccu.vx v10, s1, v8 -; RV32ZVE32F-NEXT: bnez s2, .LBB47_10 +; RV32ZVE32F-NEXT: lw t3, 24(a0) +; RV32ZVE32F-NEXT: lw t4, 28(a0) +; RV32ZVE32F-NEXT: lw t1, 32(a0) +; RV32ZVE32F-NEXT: lw t2, 36(a0) +; RV32ZVE32F-NEXT: lw s0, 8(a0) +; RV32ZVE32F-NEXT: lw s1, 12(a0) +; RV32ZVE32F-NEXT: lw t5, 16(a0) +; RV32ZVE32F-NEXT: lw t6, 20(a0) +; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; RV32ZVE32F-NEXT: vmv.x.s t0, v0 +; RV32ZVE32F-NEXT: andi s2, t0, 1 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a1 +; RV32ZVE32F-NEXT: bnez s2, .LBB47_10 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_11 ; RV32ZVE32F-NEXT: .LBB47_2: # %else2 -; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: andi a0, t0, 4 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_12 ; RV32ZVE32F-NEXT: .LBB47_3: # %else4 -; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: andi a0, t0, 8 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_13 ; RV32ZVE32F-NEXT: .LBB47_4: # %else6 -; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: andi a0, t0, 16 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_14 ; RV32ZVE32F-NEXT: .LBB47_5: # %else8 -; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: andi a0, t0, 32 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_15 ; RV32ZVE32F-NEXT: .LBB47_6: # %else10 -; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: andi a0, t0, 64 ; RV32ZVE32F-NEXT: bnez a0, 
.LBB47_16 ; RV32ZVE32F-NEXT: .LBB47_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: andi a0, t0, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_9 ; RV32ZVE32F-NEXT: .LBB47_8: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: sw a2, 0(a0) ; RV32ZVE32F-NEXT: sw a3, 4(a0) @@ -4628,61 +4625,60 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB47_10: # %cond.store ; RV32ZVE32F-NEXT: .cfi_restore_state -; RV32ZVE32F-NEXT: lw s1, 0(a0) +; RV32ZVE32F-NEXT: lw a1, 0(a0) ; RV32ZVE32F-NEXT: lw a0, 4(a0) -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s s2, v10 -; RV32ZVE32F-NEXT: sw s1, 0(s2) +; RV32ZVE32F-NEXT: vmv.x.s s2, v8 +; RV32ZVE32F-NEXT: sw a1, 0(s2) ; RV32ZVE32F-NEXT: sw a0, 4(s2) -; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: andi a0, t0, 2 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_2 ; RV32ZVE32F-NEXT: .LBB47_11: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t6, 0(a0) -; RV32ZVE32F-NEXT: sw s0, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw s0, 0(a0) +; RV32ZVE32F-NEXT: sw s1, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 4 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_3 ; RV32ZVE32F-NEXT: .LBB47_12: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t4, 0(a0) -; RV32ZVE32F-NEXT: sw t5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t5, 0(a0) +; RV32ZVE32F-NEXT: sw t6, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 8 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_4 ; RV32ZVE32F-NEXT: .LBB47_13: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t2, 0(a0) -; RV32ZVE32F-NEXT: sw t3, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t3, 0(a0) +; RV32ZVE32F-NEXT: sw t4, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 16 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_5 ; RV32ZVE32F-NEXT: .LBB47_14: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 -; RV32ZVE32F-NEXT: sw t0, 0(a0) -; RV32ZVE32F-NEXT: sw t1, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: sw t1, 0(a0) +; RV32ZVE32F-NEXT: sw t2, 4(a0) +; RV32ZVE32F-NEXT: andi a0, t0, 32 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_6 ; RV32ZVE32F-NEXT: .LBB47_15: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a6, 0(a0) ; RV32ZVE32F-NEXT: sw a7, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: andi a0, t0, 64 ; RV32ZVE32F-NEXT: beqz a0, .LBB47_7 ; RV32ZVE32F-NEXT: .LBB47_16: # 
%cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 ; RV32ZVE32F-NEXT: sw a4, 0(a0) ; RV32ZVE32F-NEXT: sw a5, 4(a0) -; RV32ZVE32F-NEXT: andi a0, a1, -128 +; RV32ZVE32F-NEXT: andi a0, t0, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB47_8 ; RV32ZVE32F-NEXT: j .LBB47_9 ; @@ -4804,11 +4800,10 @@ define void @mscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> % ; ; RV64V-LABEL: mscatter_baseidx_v8i32_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v12, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v16, v12 +; RV64V-NEXT: vsll.vi v12, v16, 3 +; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i32_v8i64: @@ -5056,11 +5051,10 @@ define void @mscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64V-LABEL: mscatter_baseidx_sext_v8i32_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v12, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v16, v12 +; RV64V-NEXT: vsll.vi v12, v16, 3 +; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8i64: @@ -5309,11 +5303,10 @@ define void @mscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i32_v8i64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulu.vx v16, v12, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vzext.vf2 v16, v12 +; RV64V-NEXT: vsll.vi v12, v16, 3 +; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8i64: @@ -6481,18 +6474,18 @@ define void @mscatter_baseidx_sext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8bf16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v9, v9 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v9, v10, v10 +; RV32-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8bf16: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwaddu.vv v10, v9, v9 -; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v10, v9 +; RV64V-NEXT: vadd.vv v9, v10, v10 +; RV64V-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8bf16: @@ -6631,8 +6624,10 @@ define void @mscatter_baseidx_zext_v8i8_v8bf16(<8 x bfloat> %val, ptr %base, <8 define void 
@mscatter_baseidx_v8bf16(<8 x bfloat> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_v8bf16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; @@ -7762,18 +7757,18 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v9, v9 -; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v9, v10, v10 +; RV32-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8f16: ; RV64V: # %bb.0: -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwaddu.vv v10, v9, v9 -; RV64V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v10, v9 +; RV64V-NEXT: vadd.vv v9, v10, v10 +; RV64V-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-ZVFH-LABEL: mscatter_baseidx_zext_v8i8_v8f16: @@ -8023,8 +8018,10 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v10, v10, v10 +; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; @@ -8811,20 +8808,20 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i8_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v11, v10, a1 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v11, v10 +; RV32-NEXT: vsll.vi v10, v11, 2 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8f32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v11, v10, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v11, v10 +; RV64V-NEXT: vsll.vi v10, v11, 2 ; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV64V-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f32: @@ -8950,11 +8947,10 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: 
mscatter_baseidx_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_v8i16_v8f32: @@ -9081,11 +9077,10 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_sext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8f32: @@ -9213,20 +9208,18 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32-LABEL: mscatter_baseidx_zext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v10, a1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8f32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v12, v10, a1 -; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-NEXT: vzext.vf2 v12, v10 +; RV64V-NEXT: vsll.vi v10, v12, 2 +; RV64V-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV64V-NEXT: ret ; ; RV64ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f32: @@ -9360,9 +9353,10 @@ define void @mscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idxs ; ; RV64V-LABEL: mscatter_baseidx_v8f32: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 4 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v12, v10, a1 +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v12, v10 +; RV64V-NEXT: vsll.vi v12, v12, 2 +; RV64V-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; @@ -10296,20 +10290,20 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_zext_v8i8_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32V-NEXT: vwmulu.vx v13, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32V-NEXT: vzext.vf2 v13, v12 +; RV32V-NEXT: vsll.vi v12, v13, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV32V-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; 
RV64V-LABEL: mscatter_baseidx_zext_v8i8_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64V-NEXT: vwmulu.vx v13, v12, a1 +; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64V-NEXT: vzext.vf2 v13, v12 +; RV64V-NEXT: vsll.vi v12, v13, 3 ; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV64V-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i8_v8f64: @@ -10501,11 +10495,11 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_v8i16_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v14, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_v8i16_v8f64: @@ -10519,88 +10513,87 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i16_v8f64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a0 -; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a2, a0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB94_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a1, a0, 2 -; RV32ZVE32F-NEXT: bnez a1, .LBB94_10 +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB94_10 ; RV32ZVE32F-NEXT: .LBB94_2: # %else2 -; RV32ZVE32F-NEXT: andi a1, a0, 4 -; RV32ZVE32F-NEXT: bnez a1, .LBB94_11 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB94_11 ; RV32ZVE32F-NEXT: .LBB94_3: # %else4 -; RV32ZVE32F-NEXT: andi a1, a0, 8 -; RV32ZVE32F-NEXT: bnez a1, .LBB94_12 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB94_12 ; RV32ZVE32F-NEXT: .LBB94_4: # %else6 -; RV32ZVE32F-NEXT: andi a1, a0, 16 -; RV32ZVE32F-NEXT: bnez a1, .LBB94_13 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB94_13 ; RV32ZVE32F-NEXT: .LBB94_5: # %else8 -; RV32ZVE32F-NEXT: andi a1, a0, 32 -; RV32ZVE32F-NEXT: bnez a1, .LBB94_14 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB94_14 ; RV32ZVE32F-NEXT: .LBB94_6: # %else10 -; RV32ZVE32F-NEXT: andi a1, a0, 64 -; RV32ZVE32F-NEXT: bnez a1, .LBB94_15 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB94_15 ; RV32ZVE32F-NEXT: .LBB94_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB94_16 ; RV32ZVE32F-NEXT: .LBB94_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB94_9: # %cond.store -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v10 -; RV32ZVE32F-NEXT: fsd fa0, 0(a1) -; RV32ZVE32F-NEXT: andi 
a1, a0, 2 -; RV32ZVE32F-NEXT: beqz a1, .LBB94_2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB94_2 ; RV32ZVE32F-NEXT: .LBB94_10: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa1, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 4 -; RV32ZVE32F-NEXT: beqz a1, .LBB94_3 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB94_3 ; RV32ZVE32F-NEXT: .LBB94_11: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa2, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 8 -; RV32ZVE32F-NEXT: beqz a1, .LBB94_4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB94_4 ; RV32ZVE32F-NEXT: .LBB94_12: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa3, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 16 -; RV32ZVE32F-NEXT: beqz a1, .LBB94_5 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB94_5 ; RV32ZVE32F-NEXT: .LBB94_13: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa4, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 32 -; RV32ZVE32F-NEXT: beqz a1, .LBB94_6 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB94_6 ; RV32ZVE32F-NEXT: .LBB94_14: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa5, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 64 -; RV32ZVE32F-NEXT: beqz a1, .LBB94_7 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB94_7 ; RV32ZVE32F-NEXT: .LBB94_15: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa6, 0(a1) -; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB94_8 ; RV32ZVE32F-NEXT: .LBB94_16: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret @@ -10698,11 +10691,11 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16 define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32V-LABEL: 
mscatter_baseidx_sext_v8i16_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulsu.vx v14, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vsext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_sext_v8i16_v8f64: @@ -10716,88 +10709,87 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 ; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i16_v8f64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a0 -; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vsext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a2, a0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccus.vx v10, a1, v8 +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB95_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a1, a0, 2 -; RV32ZVE32F-NEXT: bnez a1, .LBB95_10 +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB95_10 ; RV32ZVE32F-NEXT: .LBB95_2: # %else2 -; RV32ZVE32F-NEXT: andi a1, a0, 4 -; RV32ZVE32F-NEXT: bnez a1, .LBB95_11 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB95_11 ; RV32ZVE32F-NEXT: .LBB95_3: # %else4 -; RV32ZVE32F-NEXT: andi a1, a0, 8 -; RV32ZVE32F-NEXT: bnez a1, .LBB95_12 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB95_12 ; RV32ZVE32F-NEXT: .LBB95_4: # %else6 -; RV32ZVE32F-NEXT: andi a1, a0, 16 -; RV32ZVE32F-NEXT: bnez a1, .LBB95_13 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB95_13 ; RV32ZVE32F-NEXT: .LBB95_5: # %else8 -; RV32ZVE32F-NEXT: andi a1, a0, 32 -; RV32ZVE32F-NEXT: bnez a1, .LBB95_14 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB95_14 ; RV32ZVE32F-NEXT: .LBB95_6: # %else10 -; RV32ZVE32F-NEXT: andi a1, a0, 64 -; RV32ZVE32F-NEXT: bnez a1, .LBB95_15 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB95_15 ; RV32ZVE32F-NEXT: .LBB95_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB95_16 ; RV32ZVE32F-NEXT: .LBB95_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB95_9: # %cond.store -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v10 -; RV32ZVE32F-NEXT: fsd fa0, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 2 -; RV32ZVE32F-NEXT: beqz a1, .LBB95_2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB95_2 ; RV32ZVE32F-NEXT: .LBB95_10: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa1, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 4 -; RV32ZVE32F-NEXT: beqz a1, .LBB95_3 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB95_3 ; RV32ZVE32F-NEXT: .LBB95_11: # %cond.store3 ; 
RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa2, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 8 -; RV32ZVE32F-NEXT: beqz a1, .LBB95_4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB95_4 ; RV32ZVE32F-NEXT: .LBB95_12: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa3, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 16 -; RV32ZVE32F-NEXT: beqz a1, .LBB95_5 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB95_5 ; RV32ZVE32F-NEXT: .LBB95_13: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa4, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 32 -; RV32ZVE32F-NEXT: beqz a1, .LBB95_6 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB95_6 ; RV32ZVE32F-NEXT: .LBB95_14: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa5, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 64 -; RV32ZVE32F-NEXT: beqz a1, .LBB95_7 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB95_7 ; RV32ZVE32F-NEXT: .LBB95_15: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa6, 0(a1) -; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB95_8 ; RV32ZVE32F-NEXT: .LBB95_16: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret @@ -10896,107 +10888,106 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m) { ; RV32V-LABEL: mscatter_baseidx_zext_v8i16_v8f64: ; RV32V: # %bb.0: -; RV32V-NEXT: li a1, 8 -; RV32V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32V-NEXT: vwmulu.vx v14, v12, a1 +; RV32V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32V-NEXT: vzext.vf2 v14, v12 +; RV32V-NEXT: vsll.vi v12, v14, 3 ; RV32V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32V-NEXT: ret ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i16_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64V-NEXT: vwmulu.vx v14, v12, a1 +; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64V-NEXT: vzext.vf2 v14, v12 +; RV64V-NEXT: vsll.vi v12, v14, 3 
; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV64V-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i16_v8f64: ; RV32ZVE32F: # %bb.0: ; RV32ZVE32F-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.v.x v10, a0 -; RV32ZVE32F-NEXT: li a1, 8 +; RV32ZVE32F-NEXT: vzext.vf2 v10, v8 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a0, v0 -; RV32ZVE32F-NEXT: andi a2, a0, 1 -; RV32ZVE32F-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV32ZVE32F-NEXT: vwmaccu.vx v10, a1, v8 +; RV32ZVE32F-NEXT: vmv.x.s a1, v0 +; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32ZVE32F-NEXT: vsll.vi v8, v10, 3 +; RV32ZVE32F-NEXT: andi a2, a1, 1 +; RV32ZVE32F-NEXT: vadd.vx v8, v8, a0 ; RV32ZVE32F-NEXT: bnez a2, .LBB96_9 ; RV32ZVE32F-NEXT: # %bb.1: # %else -; RV32ZVE32F-NEXT: andi a1, a0, 2 -; RV32ZVE32F-NEXT: bnez a1, .LBB96_10 +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: bnez a0, .LBB96_10 ; RV32ZVE32F-NEXT: .LBB96_2: # %else2 -; RV32ZVE32F-NEXT: andi a1, a0, 4 -; RV32ZVE32F-NEXT: bnez a1, .LBB96_11 +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: bnez a0, .LBB96_11 ; RV32ZVE32F-NEXT: .LBB96_3: # %else4 -; RV32ZVE32F-NEXT: andi a1, a0, 8 -; RV32ZVE32F-NEXT: bnez a1, .LBB96_12 +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: bnez a0, .LBB96_12 ; RV32ZVE32F-NEXT: .LBB96_4: # %else6 -; RV32ZVE32F-NEXT: andi a1, a0, 16 -; RV32ZVE32F-NEXT: bnez a1, .LBB96_13 +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: bnez a0, .LBB96_13 ; RV32ZVE32F-NEXT: .LBB96_5: # %else8 -; RV32ZVE32F-NEXT: andi a1, a0, 32 -; RV32ZVE32F-NEXT: bnez a1, .LBB96_14 +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: bnez a0, .LBB96_14 ; RV32ZVE32F-NEXT: .LBB96_6: # %else10 -; RV32ZVE32F-NEXT: andi a1, a0, 64 -; RV32ZVE32F-NEXT: bnez a1, .LBB96_15 +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: bnez a0, .LBB96_15 ; RV32ZVE32F-NEXT: .LBB96_7: # %else12 -; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: bnez a0, .LBB96_16 ; RV32ZVE32F-NEXT: .LBB96_8: # %else14 ; RV32ZVE32F-NEXT: ret ; RV32ZVE32F-NEXT: .LBB96_9: # %cond.store -; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vmv.x.s a1, v10 -; RV32ZVE32F-NEXT: fsd fa0, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 2 -; RV32ZVE32F-NEXT: beqz a1, .LBB96_2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v8 +; RV32ZVE32F-NEXT: fsd fa0, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 2 +; RV32ZVE32F-NEXT: beqz a0, .LBB96_2 ; RV32ZVE32F-NEXT: .LBB96_10: # %cond.store1 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 1 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa1, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 4 -; RV32ZVE32F-NEXT: beqz a1, .LBB96_3 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 1 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa1, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 4 +; RV32ZVE32F-NEXT: beqz a0, .LBB96_3 ; RV32ZVE32F-NEXT: .LBB96_11: # %cond.store3 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 2 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa2, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 8 -; RV32ZVE32F-NEXT: beqz a1, .LBB96_4 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 2 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa2, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 8 +; RV32ZVE32F-NEXT: beqz a0, .LBB96_4 ; RV32ZVE32F-NEXT: 
.LBB96_12: # %cond.store5 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 3 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa3, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 16 -; RV32ZVE32F-NEXT: beqz a1, .LBB96_5 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 3 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa3, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 16 +; RV32ZVE32F-NEXT: beqz a0, .LBB96_5 ; RV32ZVE32F-NEXT: .LBB96_13: # %cond.store7 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 4 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa4, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 32 -; RV32ZVE32F-NEXT: beqz a1, .LBB96_6 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 4 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa4, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 32 +; RV32ZVE32F-NEXT: beqz a0, .LBB96_6 ; RV32ZVE32F-NEXT: .LBB96_14: # %cond.store9 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 5 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa5, 0(a1) -; RV32ZVE32F-NEXT: andi a1, a0, 64 -; RV32ZVE32F-NEXT: beqz a1, .LBB96_7 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 5 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa5, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, 64 +; RV32ZVE32F-NEXT: beqz a0, .LBB96_7 ; RV32ZVE32F-NEXT: .LBB96_15: # %cond.store11 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 6 -; RV32ZVE32F-NEXT: vmv.x.s a1, v8 -; RV32ZVE32F-NEXT: fsd fa6, 0(a1) -; RV32ZVE32F-NEXT: andi a0, a0, -128 +; RV32ZVE32F-NEXT: vslidedown.vi v10, v8, 6 +; RV32ZVE32F-NEXT: vmv.x.s a0, v10 +; RV32ZVE32F-NEXT: fsd fa6, 0(a0) +; RV32ZVE32F-NEXT: andi a0, a1, -128 ; RV32ZVE32F-NEXT: beqz a0, .LBB96_8 ; RV32ZVE32F-NEXT: .LBB96_16: # %cond.store13 ; RV32ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; RV32ZVE32F-NEXT: vslidedown.vi v8, v10, 7 +; RV32ZVE32F-NEXT: vslidedown.vi v8, v8, 7 ; RV32ZVE32F-NEXT: vmv.x.s a0, v8 ; RV32ZVE32F-NEXT: fsd fa7, 0(a0) ; RV32ZVE32F-NEXT: ret @@ -11111,11 +11102,10 @@ define void @mscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i32 ; ; RV64V-LABEL: mscatter_baseidx_v8i32_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v12, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v16, v12 +; RV64V-NEXT: vsll.vi v12, v16, 3 +; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_v8i32_v8f64: @@ -11313,11 +11303,10 @@ define void @mscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; ; RV64V-LABEL: mscatter_baseidx_sext_v8i32_v8f64: ; RV64V: # %bb.0: -; RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulsu.vx v16, v12, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vsext.vf2 v16, v12 +; RV64V-NEXT: vsll.vi v12, v16, 3 +; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_sext_v8i32_v8f64: @@ -11516,11 +11505,10 @@ define void @mscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; ; RV64V-LABEL: mscatter_baseidx_zext_v8i32_v8f64: ; RV64V: # %bb.0: -; 
RV64V-NEXT: li a1, 8 -; RV64V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64V-NEXT: vwmulu.vx v16, v12, a1 -; RV64V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV64V-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64V-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64V-NEXT: vzext.vf2 v16, v12 +; RV64V-NEXT: vsll.vi v12, v16, 3 +; RV64V-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64V-NEXT: ret ; ; RV32ZVE32F-LABEL: mscatter_baseidx_zext_v8i32_v8f64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-peephole-vmerge-vops.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-peephole-vmerge-vops.ll index 016be04ffc9b9..2317f34d903da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-peephole-vmerge-vops.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-peephole-vmerge-vops.ll @@ -11,8 +11,10 @@ declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) define <8 x i32> @vpmerge_vpadd(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpadd: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -26,8 +28,9 @@ define <8 x i32> @vpmerge_vpadd2(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vmseq.vv v0, v9, v10 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu -; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i1> splat (i1 true), i32 %vl) %m = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %x, <8 x i32> %y, metadata !"eq", <8 x i1> splat (i1 true), i32 %vl) @@ -39,8 +42,10 @@ define <8 x i32> @vpmerge_vpadd2(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y define <8 x i32> @vpmerge_vpadd3(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpadd3: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> splat (i1 true), <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -52,8 +57,10 @@ declare <8 x float> @llvm.vp.fadd.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) define <8 x float> @vpmerge_vpfadd(<8 x float> %passthru, <8 x float> %x, <8 x float> %y, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpfadd: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> %x, <8 x float> %y, <8 x i1> splat (i1 
true), i32 %vl) %b = call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> %m, <8 x float> %a, <8 x float> %passthru, i32 %vl) @@ -65,8 +72,10 @@ declare <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float>, <8 x i1>, i32) define <8 x i16> @vpmerge_vpfptosi(<8 x i16> %passthru, <8 x float> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpfptosi: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %a = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> %m, <8 x i16> %a, <8 x i16> %passthru, i32 %vl) @@ -78,8 +87,10 @@ declare <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64>, <8 x i1>, i32) define <8 x float> @vpmerge_vpsitofp(<8 x float> %passthru, <8 x i64> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpsitofp: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfncvt.f.x.w v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfncvt.f.x.w v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> %m, <8 x float> %a, <8 x float> %passthru, i32 %vl) @@ -91,8 +102,10 @@ declare <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8>, <8 x i1>, i32) define <8 x i32> @vpmerge_vpzext(<8 x i32> %passthru, <8 x i8> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpzext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vzext.vf4 v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf4 v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -104,8 +117,10 @@ declare <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64>, <8 x i1>, i32) define <8 x i32> @vpmerge_vptrunc(<8 x i32> %passthru, <8 x i64> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vptrunc: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vnsrl.wi v8, v10, 0, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -117,8 +132,10 @@ declare <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float>, <8 x i1>, i32) define <8 x double> @vpmerge_vpfpext(<8 x double> %passthru, <8 x float> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpfpext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 
; CHECK-NEXT: ret %a = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> %m, <8 x double> %a, <8 x double> %passthru, i32 %vl) @@ -130,8 +147,10 @@ declare <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double>, <8 x i1>, i32) define <8 x float> @vpmerge_vpfptrunc(<8 x float> %passthru, <8 x double> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpfptrunc: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfncvt.f.f.w v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfncvt.f.f.w v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> %m, <8 x float> %a, <8 x float> %passthru, i32 %vl) @@ -144,8 +163,10 @@ declare <8 x i32> @llvm.vp.load.v8i32.p0(ptr, <8 x i1>, i32) define <8 x i32> @vpmerge_vpload(<8 x i32> %passthru, ptr %p, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpmerge_vpload: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -157,9 +178,10 @@ define <8 x i32> @vpmerge_vpload2(<8 x i32> %passthru, ptr %p, <8 x i32> %x, <8 ; CHECK-LABEL: vpmerge_vpload2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v11, (a0) ; CHECK-NEXT: vmseq.vv v0, v9, v10 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> splat (i1 true), i32 %vl) %m = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %x, <8 x i32> %y, metadata !"eq", <8 x i1> splat (i1 true), i32 %vl) @@ -176,8 +198,9 @@ declare <8 x double> @llvm.vp.select.v8f64(<8 x i1>, <8 x double>, <8 x double>, define <8 x i32> @vpselect_vpadd(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpadd: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -188,9 +211,10 @@ define <8 x i32> @vpselect_vpadd(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y define <8 x i32> @vpselect_vpadd2(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> %y, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpadd2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vmseq.vv v0, v9, v10 -; CHECK-NEXT: vadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: vadd.vv v9, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: 
ret %a = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i1> splat (i1 true), i32 %vl) %m = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %x, <8 x i32> %y, metadata !"eq", <8 x i1> splat (i1 true), i32 %vl) @@ -214,8 +238,9 @@ define <8 x i32> @vpselect_vpadd3(<8 x i32> %passthru, <8 x i32> %x, <8 x i32> % define <8 x float> @vpselect_vpfadd(<8 x float> %passthru, <8 x float> %x, <8 x float> %y, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpfadd: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfadd.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v9, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x float> @llvm.vp.fadd.v8f32(<8 x float> %x, <8 x float> %y, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x float> @llvm.vp.select.v8f32(<8 x i1> %m, <8 x float> %a, <8 x float> %passthru, i32 %vl) @@ -226,8 +251,9 @@ define <8 x float> @vpselect_vpfadd(<8 x float> %passthru, <8 x float> %x, <8 x define <8 x i16> @vpselect_vpfptosi(<8 x i16> %passthru, <8 x float> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpfptosi: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfncvt.rtz.x.f.w v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfncvt.rtz.x.f.w v10, v9 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %a = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i16> @llvm.vp.select.v8i16(<8 x i1> %m, <8 x i16> %a, <8 x i16> %passthru, i32 %vl) @@ -238,8 +264,9 @@ define <8 x i16> @vpselect_vpfptosi(<8 x i16> %passthru, <8 x float> %x, <8 x i1 define <8 x float> @vpselect_vpsitofp(<8 x float> %passthru, <8 x i64> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpsitofp: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfncvt.f.x.w v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfncvt.f.x.w v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x float> @llvm.vp.select.v8f32(<8 x i1> %m, <8 x float> %a, <8 x float> %passthru, i32 %vl) @@ -250,8 +277,9 @@ define <8 x float> @vpselect_vpsitofp(<8 x float> %passthru, <8 x i64> %x, <8 x define <8 x i32> @vpselect_vpzext(<8 x i32> %passthru, <8 x i8> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpzext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vzext.vf4 v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf4 v10, v9 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -262,8 +290,9 @@ define <8 x i32> @vpselect_vpzext(<8 x i32> %passthru, <8 x i8> %x, <8 x i1> %m, define <8 x i32> @vpselect_vptrunc(<8 x i32> %passthru, <8 x i64> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vptrunc: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vnsrl.wi v8, v10, 0, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v9, v10, 0 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 
x i64> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -274,8 +303,10 @@ define <8 x i32> @vpselect_vptrunc(<8 x i32> %passthru, <8 x i64> %x, <8 x i1> % define <8 x double> @vpselect_vpfpext(<8 x double> %passthru, <8 x float> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpfpext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfwcvt.f.f.v v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 ; CHECK-NEXT: ret %a = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x double> @llvm.vp.select.v8f64(<8 x i1> %m, <8 x double> %a, <8 x double> %passthru, i32 %vl) @@ -286,8 +317,9 @@ define <8 x double> @vpselect_vpfpext(<8 x double> %passthru, <8 x float> %x, <8 define <8 x float> @vpselect_vpfptrunc(<8 x float> %passthru, <8 x double> %x, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpfptrunc: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfncvt.f.f.w v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfncvt.f.f.w v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> %x, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x float> @llvm.vp.select.v8f32(<8 x i1> %m, <8 x float> %a, <8 x float> %passthru, i32 %vl) @@ -298,8 +330,9 @@ define <8 x float> @vpselect_vpfptrunc(<8 x float> %passthru, <8 x double> %x, < define <8 x i32> @vpselect_vpload(<8 x i32> %passthru, ptr %p, <8 x i1> %m, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpload: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu -; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> splat (i1 true), i32 %vl) %b = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> %m, <8 x i32> %a, <8 x i32> %passthru, i32 %vl) @@ -310,9 +343,10 @@ define <8 x i32> @vpselect_vpload(<8 x i32> %passthru, ptr %p, <8 x i1> %m, i32 define <8 x i32> @vpselect_vpload2(<8 x i32> %passthru, ptr %p, <8 x i32> %x, <8 x i32> %y, i32 zeroext %vl) { ; CHECK-LABEL: vpselect_vpload2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v11, (a0) ; CHECK-NEXT: vmseq.vv v0, v9, v10 -; CHECK-NEXT: vle32.v v8, (a0), v0.t +; CHECK-NEXT: vmerge.vvm v8, v8, v11, v0 ; CHECK-NEXT: ret %a = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> splat (i1 true), i32 %vl) %m = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> %x, <8 x i32> %y, metadata !"eq", <8 x i1> splat (i1 true), i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index a426f8c619e99..744a9d02dd6a8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -307,11 +307,11 @@ define float @vreduce_fwadd_v1f32(<1 x half> %v, float %s) { define float @vreduce_ord_fwadd_v1f32(<1 x half> %v, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v1f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, 
ma -; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredosum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %e = fpext <1 x half> %v to <1 x float> @@ -352,12 +352,12 @@ define float @vreduce_ord_fadd_v2f32(ptr %x, float %s) { define float @vreduce_fwadd_v2f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_fwadd_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredusum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x half>, ptr %x @@ -369,12 +369,12 @@ define float @vreduce_fwadd_v2f32(ptr %x, float %s) { define float @vreduce_ord_fwadd_v2f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredosum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x half>, ptr %x @@ -416,12 +416,12 @@ define float @vreduce_ord_fadd_v4f32(ptr %x, float %s) { define float @vreduce_fwadd_v4f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_fwadd_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredusum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x half>, ptr %x @@ -433,12 +433,12 @@ define float @vreduce_fwadd_v4f32(ptr %x, float %s) { define float @vreduce_ord_fwadd_v4f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredosum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x half>, ptr %x @@ -555,12 +555,12 @@ define float @vreduce_ord_fadd_v8f32(ptr %x, float %s) { define float @vreduce_fwadd_v8f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_fwadd_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, 
(a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x half>, ptr %x @@ -572,12 +572,12 @@ define float @vreduce_fwadd_v8f32(ptr %x, float %s) { define float @vreduce_ord_fwadd_v8f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x half>, ptr %x @@ -619,12 +619,12 @@ define float @vreduce_ord_fadd_v16f32(ptr %x, float %s) { define float @vreduce_fwadd_v16f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_fwadd_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x half>, ptr %x @@ -636,12 +636,12 @@ define float @vreduce_fwadd_v16f32(ptr %x, float %s) { define float @vreduce_ord_fwadd_v16f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x half>, ptr %x @@ -686,12 +686,12 @@ define float @vreduce_fwadd_v32f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_fwadd_v32f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x half>, ptr %x @@ -704,12 +704,12 @@ define float @vreduce_ord_fwadd_v32f32(ptr %x, float %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v32f32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: 
vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x half>, ptr %x @@ -763,13 +763,16 @@ define float @vreduce_fwadd_v64f32(ptr %x, float %s) { ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwadd.vv v24, v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmv.s.f v8, fa0 -; CHECK-NEXT: vfredusum.vs v8, v24, v8 +; CHECK-NEXT: vfadd.vv v8, v16, v24 +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x half>, ptr %x @@ -786,13 +789,15 @@ define float @vreduce_ord_fwadd_v64f32(ptr %x, float %s) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v24, fa0 +; CHECK-NEXT: vfmv.s.f v7, fa0 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v24 -; CHECK-NEXT: vfwredosum.vs v8, v16, v8 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfredosum.vs v16, v24, v7 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <64 x half>, ptr %x @@ -843,11 +848,11 @@ define double @vreduce_fwadd_v1f64(<1 x float> %v, double %s) { define double @vreduce_ord_fwadd_v1f64(<1 x float> %v, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v1f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredosum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %e = fpext <1 x float> %v to <1 x double> @@ -888,12 +893,12 @@ define double @vreduce_ord_fadd_v2f64(ptr %x, double %s) { define double @vreduce_fwadd_v2f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_fwadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredusum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, ptr %x @@ -905,12 +910,12 @@ define double @vreduce_fwadd_v2f64(ptr %x, double %s) { define double @vreduce_ord_fwadd_v2f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, 
e32, mf2, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmv.s.f v8, fa0 +; CHECK-NEXT: vfredosum.vs v8, v9, v8 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <2 x float>, ptr %x @@ -952,12 +957,12 @@ define double @vreduce_ord_fadd_v4f64(ptr %x, double %s) { define double @vreduce_fwadd_v4f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_fwadd_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %x @@ -969,12 +974,12 @@ define double @vreduce_fwadd_v4f64(ptr %x, double %s) { define double @vreduce_ord_fwadd_v4f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v9, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <4 x float>, ptr %x @@ -1016,12 +1021,12 @@ define double @vreduce_ord_fadd_v8f64(ptr %x, double %s) { define double @vreduce_fwadd_v8f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_fwadd_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwredusum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, ptr %x @@ -1033,12 +1038,12 @@ define double @vreduce_fwadd_v8f64(ptr %x, double %s) { define double @vreduce_ord_fwadd_v8f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v10, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v10 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 ; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmv.s.f v12, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <8 x float>, ptr %x @@ -1080,12 +1085,12 @@ define double @vreduce_ord_fadd_v16f64(ptr %x, double %s) { define double @vreduce_fwadd_v16f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_fwadd_v16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: 
vfwredusum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %x @@ -1097,12 +1102,12 @@ define double @vreduce_fwadd_v16f64(ptr %x, double %s) { define double @vreduce_ord_fwadd_v16f64(ptr %x, double %s) { ; CHECK-LABEL: vreduce_ord_fwadd_v16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfmv.s.f v12, fa0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <16 x float>, ptr %x @@ -1153,13 +1158,16 @@ define double @vreduce_fwadd_v32f64(ptr %x, double %s) { ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vslidedown.vi v8, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwadd.vv v24, v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfmv.s.f v8, fa0 -; CHECK-NEXT: vfredusum.vs v8, v24, v8 +; CHECK-NEXT: vfadd.vv v8, v16, v24 +; CHECK-NEXT: vfmv.s.f v16, fa0 +; CHECK-NEXT: vfredusum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, ptr %x @@ -1175,13 +1183,15 @@ define double @vreduce_ord_fwadd_v32f64(ptr %x, double %s) { ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 16, e64, m1, ta, ma -; CHECK-NEXT: vfmv.s.f v24, fa0 +; CHECK-NEXT: vfmv.s.f v7, fa0 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwredosum.vs v8, v8, v24 -; CHECK-NEXT: vfwredosum.vs v8, v16, v8 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 ; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfredosum.vs v16, v24, v7 +; CHECK-NEXT: vfredosum.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret %v = load <32 x float>, ptr %x @@ -1981,26 +1991,26 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v24, (a2) +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; 
CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfmin.vv v24, v16, v24 ; CHECK-NEXT: csrr a0, vlenb @@ -2025,14 +2035,14 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 @@ -2663,26 +2673,26 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: addi a2, a0, 128 +; CHECK-NEXT: addi a2, a0, 256 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v24, (a2) +; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: addi a1, a0, 256 +; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfmax.vv v24, v16, v24 ; CHECK-NEXT: csrr a0, vlenb @@ -2707,14 +2717,14 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 +; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 ; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: vmfne.vv v16, v8, v8 ; CHECK-NEXT: vcpop.m a0, v16 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll index d3a36525115c8..3eee8b9021659 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int.ll @@ -222,10 +222,9 @@ define i16 @vwreduce_add_v2i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <2 x i8>, ptr %x @@ -239,10 +238,9 @@ define i16 @vwreduce_uadd_v2i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <2 x i8>, ptr %x @@ -272,10 +270,9 @@ define i16 @vwreduce_add_v4i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <4 x i8>, ptr %x @@ -289,10 +286,9 @@ define i16 @vwreduce_uadd_v4i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <4 x i8>, ptr %x @@ -322,10 +318,9 @@ define i16 @vwreduce_add_v8i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <8 x i8>, ptr %x @@ -339,10 +334,9 @@ define i16 @vwreduce_uadd_v8i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <8 x i8>, ptr %x @@ -371,11 +365,10 @@ define i16 @vwreduce_add_v16i16(ptr %x) { ; CHECK-LABEL: vwreduce_add_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: 
vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i8>, ptr %x @@ -388,11 +381,10 @@ define i16 @vwreduce_uadd_v16i16(ptr %x) { ; CHECK-LABEL: vwreduce_uadd_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i8>, ptr %x @@ -423,11 +415,10 @@ define i16 @vwreduce_add_v32i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <32 x i8>, ptr %x @@ -441,11 +432,10 @@ define i16 @vwreduce_uadd_v32i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <32 x i8>, ptr %x @@ -476,11 +466,10 @@ define i16 @vwreduce_add_v64i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <64 x i8>, ptr %x @@ -494,11 +483,10 @@ define i16 @vwreduce_uadd_v64i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 64 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m4, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <64 x i8>, ptr %x @@ -534,13 +522,15 @@ define i16 @vwreduce_add_v128i16(ptr %x) { ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx 
v16, v8, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwadd.vv v24, v8, v16 -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vadd.vv v8, v16, v24 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <128 x i8>, ptr %x @@ -556,13 +546,15 @@ define i16 @vwreduce_uadd_v128i16(ptr %x) { ; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: li a0, 64 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v24, v8, v16 -; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vadd.vv v8, v16, v24 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <128 x i8>, ptr %x @@ -628,10 +620,9 @@ define i32 @vwreduce_add_v2i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <2 x i16>, ptr %x @@ -645,10 +636,9 @@ define i32 @vwreduce_uadd_v2i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <2 x i16>, ptr %x @@ -678,10 +668,9 @@ define i32 @vwreduce_add_v4i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <4 x i16>, ptr %x @@ -695,10 +684,9 @@ define i32 @vwreduce_uadd_v4i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <4 x i16>, ptr %x @@ -727,11 +715,10 @@ define i32 @vwreduce_add_v8i32(ptr %x) { ; CHECK-LABEL: vwreduce_add_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli 
zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <8 x i16>, ptr %x @@ -744,11 +731,10 @@ define i32 @vwreduce_uadd_v8i32(ptr %x) { ; CHECK-LABEL: vwreduce_uadd_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <8 x i16>, ptr %x @@ -777,11 +763,10 @@ define i32 @vwreduce_add_v16i32(ptr %x) { ; CHECK-LABEL: vwreduce_add_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %x @@ -794,11 +779,10 @@ define i32 @vwreduce_uadd_v16i32(ptr %x) { ; CHECK-LABEL: vwreduce_uadd_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v10, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vmv.s.x v12, zero +; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %x @@ -829,11 +813,10 @@ define i32 @vwreduce_add_v32i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vwredsum.vs v8, v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <32 x i16>, ptr %x @@ -847,11 +830,10 @@ define i32 @vwreduce_uadd_v32i32(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vmv.s.x v12, zero -; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <32 x i16>, ptr %x @@ -887,13 +869,15 @@ define i32 @vwreduce_add_v64i32(ptr %x) { ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e32, 
m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwadd.vv v24, v8, v16 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vadd.vv v8, v16, v24 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <64 x i16>, ptr %x @@ -909,13 +893,15 @@ define i32 @vwreduce_uadd_v64i32(ptr %x) { ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v24, v8, v16 -; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v24, v8 +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vadd.vv v8, v16, v24 +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret %v = load <64 x i16>, ptr %x @@ -1037,10 +1023,9 @@ define i64 @vwreduce_add_v2i64(ptr %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwredsum.vs v8, v8, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmv.s.x v8, zero +; RV64-NEXT: vredsum.vs v8, v9, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <2 x i32>, ptr %x @@ -1068,10 +1053,9 @@ define i64 @vwreduce_uadd_v2i64(ptr %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwredsumu.vs v8, v8, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vmv.s.x v8, zero +; RV64-NEXT: vredsum.vs v8, v9, v8 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <2 x i32>, ptr %x @@ -1127,11 +1111,10 @@ define i64 @vwreduce_add_v4i64(ptr %x) { ; RV64-LABEL: vwreduce_add_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vwredsum.vs v8, v8, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0) +; RV64-NEXT: vsext.vf2 v8, v10 +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vredsum.vs v8, v8, v10 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <4 x i32>, ptr %x @@ -1158,11 +1141,10 @@ define i64 @vwreduce_uadd_v4i64(ptr %x) { ; RV64-LABEL: vwreduce_uadd_v4i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v9, zero -; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vwredsumu.vs v8, v8, v9 -; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0) +; RV64-NEXT: vzext.vf2 v8, v10 +; RV64-NEXT: vmv.s.x v10, zero +; RV64-NEXT: vredsum.vs v8, 
v8, v10 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <4 x i32>, ptr %x @@ -1218,11 +1200,10 @@ define i64 @vwreduce_add_v8i64(ptr %x) { ; RV64-LABEL: vwreduce_add_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vwredsum.vs v8, v8, v10 -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: vsext.vf2 v8, v12 +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vredsum.vs v8, v8, v12 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <8 x i32>, ptr %x @@ -1249,11 +1230,10 @@ define i64 @vwreduce_uadd_v8i64(ptr %x) { ; RV64-LABEL: vwreduce_uadd_v8i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v10, zero -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vwredsumu.vs v8, v8, v10 -; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vle32.v v12, (a0) +; RV64-NEXT: vzext.vf2 v8, v12 +; RV64-NEXT: vmv.s.x v12, zero +; RV64-NEXT: vredsum.vs v8, v8, v12 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <8 x i32>, ptr %x @@ -1309,11 +1289,10 @@ define i64 @vwreduce_add_v16i64(ptr %x) { ; RV64-LABEL: vwreduce_add_v16i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64-NEXT: vwredsum.vs v8, v8, v12 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vsext.vf2 v8, v16 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <16 x i32>, ptr %x @@ -1340,11 +1319,10 @@ define i64 @vwreduce_uadd_v16i64(ptr %x) { ; RV64-LABEL: vwreduce_uadd_v16i64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: vmv.s.x v12, zero -; RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV64-NEXT: vwredsumu.vs v8, v8, v12 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vzext.vf2 v8, v16 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <16 x i32>, ptr %x @@ -1394,13 +1372,15 @@ define i64 @vwreduce_add_v32i64(ptr %x) { ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsext.vf2 v16, v8 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 16 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v24, v8, v16 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vmv.s.x v8, zero -; RV32-NEXT: vredsum.vs v8, v24, v8 +; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsext.vf2 v24, v8 +; RV32-NEXT: vadd.vv v8, v16, v24 +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 @@ -1412,13 +1392,15 @@ define i64 @vwreduce_add_v32i64(ptr %x) { ; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v16, v8 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: 
vslidedown.vi v16, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwadd.vv v24, v8, v16 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.s.x v8, zero -; RV64-NEXT: vredsum.vs v8, v24, v8 +; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vadd.vv v8, v16, v24 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <32 x i32>, ptr %x @@ -1433,13 +1415,15 @@ define i64 @vwreduce_uadd_v32i64(ptr %x) { ; RV32-NEXT: li a1, 32 ; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vzext.vf2 v16, v8 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v8, 16 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v24, v8, v16 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vmv.s.x v8, zero -; RV32-NEXT: vredsum.vs v8, v24, v8 +; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vzext.vf2 v24, v8 +; RV32-NEXT: vadd.vv v8, v16, v24 +; RV32-NEXT: vmv.s.x v16, zero +; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; RV32-NEXT: vsrl.vx v8, v8, a1 @@ -1451,13 +1435,15 @@ define i64 @vwreduce_uadd_v32i64(ptr %x) { ; RV64-NEXT: li a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v16, v8 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v8, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwaddu.vv v24, v8, v16 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.s.x v8, zero -; RV64-NEXT: vredsum.vs v8, v24, v8 +; RV64-NEXT: vslidedown.vi v8, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vadd.vv v8, v16, v24 +; RV64-NEXT: vmv.s.x v16, zero +; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: ret %v = load <32 x i32>, ptr %x @@ -1516,26 +1502,28 @@ define i64 @vreduce_add_v64i64(ptr %x) nounwind { define i64 @vwreduce_add_v64i64(ptr %x) { ; RV32-LABEL: vwreduce_add_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsext.vf2 v24, v16 +; RV32-NEXT: vsext.vf2 v0, v8 +; RV32-NEXT: vadd.vv v24, v24, v0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v8, 16 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v0, v8, v16 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v16, 16 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwadd.vv v16, v24, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v16, 16 +; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vsext.vf2 v0, v16 +; RV32-NEXT: vsext.vf2 v16, v8 ; RV32-NEXT: vadd.vv v8, v0, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: 
vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1544,30 +1532,45 @@ define i64 @vwreduce_add_v64i64(ptr %x) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v0, v16 ; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; RV64-NEXT: vslidedown.vi v0, v16, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwadd.vv v24, v8, v16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vwadd.vv v8, v16, v0 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vmv4r.v v8, v24 +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsext.vf2 v8, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vadd.vv v16, v0, v16 ; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -1582,26 +1585,28 @@ define i64 @vwreduce_add_v64i64(ptr %x) { define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV32-LABEL: vwreduce_uadd_v64i64: ; RV32: # %bb.0: -; RV32-NEXT: li a1, 32 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, a0, 128 +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV32-NEXT: vle32.v v16, (a0) +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vzext.vf2 v24, v16 +; RV32-NEXT: vzext.vf2 v0, v8 +; RV32-NEXT: vadd.vv v24, v24, v0 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v8, 16 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v0, v8, v16 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v16, 16 -; RV32-NEXT: vsetivli 
zero, 16, e32, m4, ta, ma -; RV32-NEXT: vwaddu.vv v16, v24, v8 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v16, 16 +; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vzext.vf2 v0, v16 +; RV32-NEXT: vzext.vf2 v16, v8 ; RV32-NEXT: vadd.vv v8, v0, v16 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vmv.s.x v16, zero ; RV32-NEXT: vredsum.vs v8, v8, v16 ; RV32-NEXT: vmv.x.s a0, v8 ; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a1 +; RV32-NEXT: vsrl.vx v8, v8, a2 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: ret ; @@ -1610,30 +1615,45 @@ define i64 @vwreduce_uadd_v64i64(ptr %x) { ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; RV64-NEXT: li a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, a0, 128 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: li a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; RV64-NEXT: vle32.v v16, (a0) +; RV64-NEXT: vle32.v v8, (a1) ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 +; RV64-NEXT: vslidedown.vi v24, v16, 16 +; RV64-NEXT: vslidedown.vi v0, v8, 16 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v0, v16 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vzext.vf2 v16, v8 +; RV64-NEXT: vmv4r.v v8, v24 +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vzext.vf2 v8, v0 ; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; RV64-NEXT: vslidedown.vi v0, v16, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwaddu.vv v24, v8, v16 -; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; RV64-NEXT: vwaddu.vv v8, v16, v0 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vadd.vv v16, v0, v16 ; RV64-NEXT: vadd.vv v8, v24, v8 +; RV64-NEXT: vadd.vv v8, v16, v8 ; RV64-NEXT: vmv.s.x v16, zero ; RV64-NEXT: vredsum.vs v8, v8, v16 ; RV64-NEXT: vmv.x.s a0, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: slli a1, a1, 4 ; RV64-NEXT: add sp, sp, a1 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll index c0a213034c95b..440bf2b632201 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-sad.ll @@ -10,10 +10,9 @@ define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) { ; CHECK-NEXT: vmaxu.vv v8, v8, v9 ; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; 
CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmv.s.x v8, zero +; CHECK-NEXT: vredsum.vs v8, v9, v8 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret entry: @@ -53,12 +52,11 @@ define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vminu.vv v10, v8, v9 ; CHECK-NEXT: vmaxu.vv v8, v8, v9 -; CHECK-NEXT: vsub.vv v8, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vmv.s.x v9, zero -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vwredsumu.vs v8, v8, v9 +; CHECK-NEXT: vsub.vv v10, v8, v10 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vmv.s.x v10, zero +; CHECK-NEXT: vredsum.vs v8, v8, v10 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret entry: @@ -108,29 +106,36 @@ define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea ; CHECK-NEXT: vle8.v v13, (a1) ; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: add a1, a1, a3 +; CHECK-NEXT: vle8.v v18, (a0) +; CHECK-NEXT: vle8.v v19, (a1) ; CHECK-NEXT: vminu.vv v14, v8, v9 ; CHECK-NEXT: vmaxu.vv v8, v8, v9 -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vsub.vv v8, v8, v14 -; CHECK-NEXT: vminu.vv v14, v10, v11 +; CHECK-NEXT: vsub.vv v14, v8, v14 +; CHECK-NEXT: vminu.vv v15, v10, v11 ; CHECK-NEXT: vmaxu.vv v10, v10, v11 -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vsub.vv v10, v10, v14 -; CHECK-NEXT: vminu.vv v14, v12, v13 -; CHECK-NEXT: vmaxu.vv v15, v12, v13 -; CHECK-NEXT: vwaddu.vv v12, v10, v8 -; CHECK-NEXT: vsub.vv v8, v15, v14 -; CHECK-NEXT: vminu.vv v10, v9, v11 -; CHECK-NEXT: vmaxu.vv v9, v9, v11 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vzext.vf2 v14, v8 +; CHECK-NEXT: vzext.vf2 v8, v14 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vsub.vv v14, v10, v15 +; CHECK-NEXT: vminu.vv v15, v12, v13 +; CHECK-NEXT: vmaxu.vv v12, v12, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v14 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma -; CHECK-NEXT: vsub.vv v16, v9, v10 +; CHECK-NEXT: vsub.vv v12, v12, v15 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v14, v12 +; CHECK-NEXT: vadd.vv v16, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v8, v12 ; CHECK-NEXT: vzext.vf2 v12, v16 -; CHECK-NEXT: vwaddu.wv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vminu.vv v12, v18, v19 +; CHECK-NEXT: vmaxu.vv v13, v18, v19 +; CHECK-NEXT: vsub.vv v16, v13, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: vmv.s.x v12, zero ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index 22956f8fe3551..e813611f50267 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -8,8 +8,9 @@ define <1 x i32> @select_addsub_v1i32(<1 x i1> %cc, <1 x i32> %a, <1 x i32> %b) { ; CHECK-LABEL: select_addsub_v1i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: 
vsetivli zero, 1, e32, mf2, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <1 x i32> %a, %b @@ -21,8 +22,9 @@ define <1 x i32> @select_addsub_v1i32(<1 x i1> %cc, <1 x i32> %a, <1 x i32> %b) define <2 x i32> @select_addsub_v2i32(<2 x i1> %cc, <2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: select_addsub_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <2 x i32> %a, %b @@ -34,8 +36,9 @@ define <2 x i32> @select_addsub_v2i32(<2 x i1> %cc, <2 x i32> %a, <2 x i32> %b) define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -47,9 +50,10 @@ define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_select_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmnot.m v0, v0 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -61,8 +65,9 @@ define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_add_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -74,9 +79,10 @@ define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_both_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmnot.m v0, v0 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -88,8 +94,9 @@ define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, < define <4 x i32> @select_addsub_v4i32_sub_swapped(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_sub_swapped: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v8, 0 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %sub = sub <4 x i32> %b, %a @@ -101,8 +108,9 @@ define 
<4 x i32> @select_addsub_v4i32_sub_swapped(<4 x i1> %cc, <4 x i32> %a, <4 define <8 x i32> @select_addsub_v8i32(<8 x i1> %cc, <8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: select_addsub_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vrsub.vi v12, v10, 0 +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %sub = sub <8 x i32> %a, %b @@ -114,8 +122,9 @@ define <8 x i32> @select_addsub_v8i32(<8 x i1> %cc, <8 x i32> %a, <8 x i32> %b) define <16 x i32> @select_addsub_v16i32(<16 x i1> %cc, <16 x i32> %a, <16 x i32> %b) { ; CHECK-LABEL: select_addsub_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vrsub.vi v12, v12, 0, v0.t +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vrsub.vi v16, v12, 0 +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %sub = sub <16 x i32> %a, %b @@ -128,8 +137,9 @@ define <32 x i32> @select_addsub_v32i32(<32 x i1> %cc, <32 x i32> %a, <32 x i32> ; CHECK-LABEL: select_addsub_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vrsub.vi v16, v16, 0, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vrsub.vi v24, v16, 0 +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 ; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %sub = sub <32 x i32> %a, %b @@ -144,29 +154,39 @@ define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma -; CHECK-NEXT: vmv8r.v v16, v8 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vrsub.vi v16, v8, 0 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vadd.vv v8, v16, v8 -; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vrsub.vi v16, v24, 0 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v16, v16, v24 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v24, v8 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # 
vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v16, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -181,8 +201,9 @@ define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> define <8 x i64> @select_addsub_v8i64(<8 x i1> %cc, <8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: select_addsub_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vrsub.vi v12, v12, 0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vrsub.vi v16, v12, 0 +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %sub = sub <8 x i64> %a, %b @@ -194,8 +215,9 @@ define <8 x i64> @select_addsub_v8i64(<8 x i1> %cc, <8 x i64> %a, <8 x i64> %b) define <8 x i16> @select_addsub_v8i16(<8 x i1> %cc, <8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: select_addsub_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i16> %a, %b @@ -207,8 +229,9 @@ define <8 x i16> @select_addsub_v8i16(<8 x i1> %cc, <8 x i16> %a, <8 x i16> %b) define <8 x i8> @select_addsub_v8i8(<8 x i1> %cc, <8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: select_addsub_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i8> %a, %b @@ -232,8 +255,9 @@ define <8 x i1> @select_addsub_v8i1(<8 x i1> %cc, <8 x i1> %a, <8 x i1> %b) { define <8 x i2> @select_addsub_v8i2(<8 x i1> %cc, <8 x i2> %a, <8 x i2> %b) { ; CHECK-LABEL: select_addsub_v8i2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i2> %a, %b @@ -245,9 +269,10 @@ define <8 x i2> @select_addsub_v8i2(<8 x i1> %cc, <8 x i2> %a, <8 x i2> %b) { define <4 x i32> @select_addsub_v4i32_constmask(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_constmask: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -259,9 +284,10 @@ define <4 x i32> @select_addsub_v4i32_constmask(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @select_addsub_v4i32_constmask2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_constmask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vrsub.vi v10, v8, 0 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %sub = sub <4 x i32> %b, %a @@ -274,9 +300,10 @@ define <4 x i32> @select_addsub_v4i32_constmask2(<4 x i32> %a, <4 x 
i32> %b) { define <4 x i32> @select_addsub_v4i32_as_shuffle(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_as_shuffle: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vrsub.vi v10, v9, 0 +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b @@ -289,9 +316,10 @@ define <4 x i32> @select_addsub_v4i32_as_shuffle(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @select_addsub_v4i32_as_shuffle2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_as_shuffle2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vrsub.vi v10, v8, 0 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %sub = sub <4 x i32> %b, %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-int.ll index 69cc7c36b793b..4fa1252fa505a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-int.ll @@ -10,9 +10,10 @@ define <1 x i1> @select_v1i1(i1 zeroext %c, <1 x i1> %a, <1 x i1> %b) { ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select i1 %c, <1 x i1> %a, <1 x i1> %b ret <1 x i1> %v @@ -26,9 +27,10 @@ define <1 x i1> @selectcc_v1i1(i1 signext %a, i1 signext %b, <1 x i1> %c, <1 x i ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.s.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %cmp = icmp ne i1 %a, %b %v = select i1 %cmp, <1 x i1> %c, <1 x i1> %d @@ -41,9 +43,10 @@ define <2 x i1> @select_v2i1(i1 zeroext %c, <2 x i1> %a, <2 x i1> %b) { ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select i1 %c, <2 x i1> %a, <2 x i1> %b ret <2 x i1> %v @@ -57,9 +60,10 @@ define <2 x i1> @selectcc_v2i1(i1 signext %a, i1 signext %b, <2 x i1> %c, <2 x i ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %cmp = icmp ne i1 %a, %b %v = select i1 %cmp, <2 x i1> %c, <2 x i1> %d @@ -72,9 +76,10 @@ define <4 x i1> @select_v4i1(i1 zeroext %c, <4 x i1> %a, <4 x i1> %b) { ; 
CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select i1 %c, <4 x i1> %a, <4 x i1> %b ret <4 x i1> %v @@ -88,9 +93,10 @@ define <4 x i1> @selectcc_v4i1(i1 signext %a, i1 signext %b, <4 x i1> %c, <4 x i ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %cmp = icmp ne i1 %a, %b %v = select i1 %cmp, <4 x i1> %c, <4 x i1> %d @@ -103,9 +109,10 @@ define <8 x i1> @select_v8i1(i1 zeroext %c, <8 x i1> %a, <8 x i1> %b) { ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select i1 %c, <8 x i1> %a, <8 x i1> %b ret <8 x i1> %v @@ -119,9 +126,10 @@ define <8 x i1> @selectcc_v8i1(i1 signext %a, i1 signext %b, <8 x i1> %c, <8 x i ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %cmp = icmp ne i1 %a, %b %v = select i1 %cmp, <8 x i1> %c, <8 x i1> %d @@ -134,9 +142,10 @@ define <16 x i1> @select_v16i1(i1 zeroext %c, <16 x i1> %a, <16 x i1> %b) { ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select i1 %c, <16 x i1> %a, <16 x i1> %b ret <16 x i1> %v @@ -150,9 +159,10 @@ define <16 x i1> @selectcc_v16i1(i1 signext %a, i1 signext %b, <16 x i1> %c, <16 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a0 ; CHECK-NEXT: vmsne.vi v9, v9, 0 -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %cmp = icmp ne i1 %a, %b %v = select i1 %cmp, <16 x i1> %c, <16 x i1> %d diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll index ba64655947602..9bafa20bc790f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll @@ -545,7 +545,8 @@ define <8 x i1> @fcmp_ueq_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; 
ZVFH-NEXT: vmflt.vv v10, v8, v9, v0.t ; ZVFH-NEXT: vmflt.vv v8, v9, v8, v0.t -; ZVFH-NEXT: vmnor.mm v0, v8, v10 +; ZVFH-NEXT: vmor.mm v8, v8, v10 +; ZVFH-NEXT: vmnot.m v0, v8 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fcmp_ueq_vv_v8f16: @@ -556,7 +557,8 @@ define <8 x i1> @fcmp_ueq_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v8, v12, v10, v0.t ; ZVFHMIN-NEXT: vmflt.vv v9, v10, v12, v0.t -; ZVFHMIN-NEXT: vmnor.mm v0, v9, v8 +; ZVFHMIN-NEXT: vmor.mm v8, v9, v8 +; ZVFHMIN-NEXT: vmnot.m v0, v8 ; ZVFHMIN-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ueq", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -568,7 +570,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vmflt.vf v9, v8, fa0, v0.t ; ZVFH-NEXT: vmfgt.vf v8, v8, fa0, v0.t -; ZVFH-NEXT: vmnor.mm v0, v8, v9 +; ZVFH-NEXT: vmor.mm v8, v8, v9 +; ZVFH-NEXT: vmnot.m v0, v8 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fcmp_ueq_vf_v8f16: @@ -581,7 +584,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i32 zer ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v12, v10, v8, v0.t ; ZVFHMIN-NEXT: vmflt.vv v13, v8, v10, v0.t -; ZVFHMIN-NEXT: vmnor.mm v0, v13, v12 +; ZVFHMIN-NEXT: vmor.mm v8, v13, v12 +; ZVFHMIN-NEXT: vmnot.m v0, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -595,7 +599,8 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; ZVFH-NEXT: vmflt.vf v8, v8, fa0, v0.t -; ZVFH-NEXT: vmnor.mm v0, v8, v9 +; ZVFH-NEXT: vmor.mm v8, v8, v9 +; ZVFH-NEXT: vmnot.m v0, v8 ; ZVFH-NEXT: ret ; ; ZVFHMIN-LABEL: fcmp_ueq_vf_swap_v8f16: @@ -608,7 +613,8 @@ define <8 x i1> @fcmp_ueq_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmflt.vv v12, v8, v10, v0.t ; ZVFHMIN-NEXT: vmflt.vv v13, v10, v8, v0.t -; ZVFHMIN-NEXT: vmnor.mm v0, v13, v12 +; ZVFHMIN-NEXT: vmor.mm v8, v13, v12 +; ZVFHMIN-NEXT: vmnot.m v0, v8 ; ZVFHMIN-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -3651,7 +3657,8 @@ define <8 x i1> @fcmp_ueq_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmflt.vv v16, v8, v12, v0.t ; CHECK-NEXT: vmflt.vv v17, v12, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v17, v16 +; CHECK-NEXT: vmor.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ueq", <8 x i1> %m, i32 %evl) ret <8 x i1> %v @@ -3663,7 +3670,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f64(<8 x double> %va, double %b, <8 x i1> %m, i32 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer @@ -3677,7 +3685,8 @@ define <8 x i1> 
@fcmp_ueq_vf_swap_v8f64(<8 x double> %va, double %b, <8 x i1> %m ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index f2353e7d028bd..e0c748e414acc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -35,8 +35,8 @@ define <8 x i32> @concat_4xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x ; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; VLS-NEXT: vmv1r.v v13, v10 ; VLS-NEXT: vmv1r.v v12, v8 -; VLS-NEXT: vslideup.vi v13, v11, 2 ; VLS-NEXT: vslideup.vi v12, v9, 2 +; VLS-NEXT: vslideup.vi v13, v11, 2 ; VLS-NEXT: vmv2r.v v8, v12 ; VLS-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> @@ -70,8 +70,8 @@ define <8 x i32> @concat_8xv1i32(<1 x i32> %a, <1 x i32> %b, <1 x i32> %c, <1 x ; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; VLS-NEXT: vslideup.vi v17, v14, 2 ; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; VLS-NEXT: vslideup.vi v10, v11, 1 ; VLS-NEXT: vslideup.vi v16, v9, 1 +; VLS-NEXT: vslideup.vi v10, v11, 1 ; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; VLS-NEXT: vslideup.vi v16, v10, 2 ; VLS-NEXT: vmv2r.v v8, v16 @@ -145,10 +145,10 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x ; VLS-NEXT: vmv1r.v v18, v12 ; VLS-NEXT: vmv1r.v v17, v10 ; VLS-NEXT: vmv1r.v v16, v8 -; VLS-NEXT: vslideup.vi v19, v15, 2 ; VLS-NEXT: vslideup.vi v18, v13, 2 -; VLS-NEXT: vslideup.vi v17, v11, 2 +; VLS-NEXT: vslideup.vi v19, v15, 2 ; VLS-NEXT: vslideup.vi v16, v9, 2 +; VLS-NEXT: vslideup.vi v17, v11, 2 ; VLS-NEXT: vmv4r.v v8, v16 ; VLS-NEXT: ret %ab = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll index b6267bf481c85..4839763c05028 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave2.ll @@ -1379,17 +1379,17 @@ define <16 x i64> @unzip2a_dual_v16i64(<16 x i64> %a, <16 x i64> %b) { ; V-LABEL: unzip2a_dual_v16i64: ; V: # %bb.0: # %entry ; V-NEXT: lui a0, 5 -; V-NEXT: vsetivli zero, 16, e16, m1, ta, ma -; V-NEXT: vid.v v16 ; V-NEXT: addi a0, a0, 1365 +; V-NEXT: vsetivli zero, 16, e16, m1, ta, ma ; V-NEXT: vmv.s.x v20, a0 ; V-NEXT: li a0, -256 -; V-NEXT: vadd.vv v21, v16, v16 +; V-NEXT: vid.v v21 ; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma ; V-NEXT: vcompress.vm v16, v8, v20 ; V-NEXT: vmv.s.x v0, a0 ; V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; V-NEXT: vadd.vi v8, v21, -16 +; V-NEXT: vadd.vv v8, v21, v21 +; V-NEXT: vadd.vi v8, v8, -16 ; V-NEXT: vsetvli zero, zero, e64, m4, ta, mu ; V-NEXT: vrgatherei16.vv v16, v12, v8, v0.t ; V-NEXT: vmv.v.v v8, v16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index 9629b3547b3d0..e2dc5d7898ff2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -69,27 +69,26 @@ define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) { define <8 x i32> @m2_pair_swap_vl8(<8 x i32> %v1) vscale_range(2,2) { ; RV32-LABEL: m2_pair_swap_vl8: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v12, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: m2_pair_swap_vl8: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsrl.vv v12, v8, v10 +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: ret %res = shufflevector <8 x i32> %v1, <8 x i32> poison, <8 x i32> ret <8 x i32> %res @@ -183,17 +182,17 @@ define void @shuffle1(ptr %explicit_0, ptr %explicit_1) vscale_range(2,2) { ; CHECK: # %bb.0: ; CHECK-NEXT: addi a0, a0, 252 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmv.v.i v0, 1 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v10, v10, 1, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmv.v.i v0, 5 ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v10, 0 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 +; CHECK-NEXT: vmerge.vvm v11, v11, v8, v0 ; CHECK-NEXT: addi a0, a1, 672 -; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vs2r.v v10, (a0) ; CHECK-NEXT: ret %1 = getelementptr i32, ptr %explicit_0, i64 63 %2 = load <3 x i32>, ptr %1, align 1 @@ -227,11 +226,12 @@ define <16 x float> @shuffle2(<4 x float> %a) vscale_range(2,2) { define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) vscale_range(2,2) { ; RV32-LABEL: extract_any_extend_vector_inreg_v16i64: ; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 1 ; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vmv.v.i v0, 1 ; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu ; RV32-NEXT: vslidedown.vi v18, v15, 1, v0.t ; RV32-NEXT: vsetivli zero, 1, e64, m8, ta, ma ; RV32-NEXT: vslidedown.vx v8, v16, a0 @@ -337,21 +337,19 @@ entry: define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) { ; RV32-LABEL: multi_chunks_shuffle: ; RV32: # %bb.0: # %entry -; RV32-NEXT: vsetivli zero, 16, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: li a0, 32 ; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v12, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: vsetivli zero, 16, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 ; RV32-NEXT: lui a0, 61681 ; 
RV32-NEXT: addi a0, a0, -241 +; RV32-NEXT: vand.vx v12, v10, a1 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vsll.vv v12, v8, v12 ; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v12 +; RV32-NEXT: vsrl.vv v8, v8, v10 ; RV32-NEXT: vmv.s.x v0, a0 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 ; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 @@ -367,12 +365,13 @@ define i64 @multi_chunks_shuffle(<32 x i32> %0) vscale_range(8,8) { ; RV64: # %bb.0: # %entry ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 16, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: vmv.v.x v10, a0 ; RV64-NEXT: lui a0, 61681 ; RV64-NEXT: addi a0, a0, -241 -; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vsrl.vv v12, v8, v10 +; RV64-NEXT: vsll.vv v8, v8, v10 ; RV64-NEXT: vmv.s.x v0, a0 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; RV64-NEXT: vmv.v.i v10, 0 ; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 @@ -431,21 +430,23 @@ define void @shuffle_i256_ldst(ptr %p) vscale_range(2,2) { define void @shuffle_3_input_vectors() vscale_range(4,4) { ; CHECK-LABEL: shuffle_3_input_vectors: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 1 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 6 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmv.v.i v16, 1 +; CHECK-NEXT: vmv1r.v v18, v12 ; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v20, v8, 1, v0.t -; CHECK-NEXT: vslideup.vi v20, v9, 3 -; CHECK-NEXT: vslidedown.vi v21, v9, 1 -; CHECK-NEXT: vmv1r.v v22, v8 +; CHECK-NEXT: vslidedown.vi v18, v16, 1, v0.t +; CHECK-NEXT: vmv8r.v v24, v8 +; CHECK-NEXT: vslideup.vi v18, v17, 3 +; CHECK-NEXT: vmv.v.v v28, v18 +; CHECK-NEXT: vslidedown.vi v29, v17, 1 +; CHECK-NEXT: vmv1r.v v30, v16 ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vmsgt.vi v8, v16, 0 +; CHECK-NEXT: vmslt.vv v16, v8, v24 ; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; CHECK-NEXT: vmv.x.s a0, v8 +; CHECK-NEXT: vmv.x.s a0, v16 ; CHECK-NEXT: sb a0, 0(zero) ; CHECK-NEXT: ret %1 = shufflevector <32 x i64> zeroinitializer, <32 x i64> splat (i64 1), <32 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll index f580b1b993395..ad87dd107e7e8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll @@ -46,7 +46,8 @@ define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) { ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 9 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 +; CHECK-NEXT: vfmv.v.f v10, fa5 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x double> , <4 x double> %x, <4 x i32> ret <4 x double> %s @@ -60,7 +61,8 @@ define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) { ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 6 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 +; CHECK-NEXT: vfmv.v.f v10, fa5 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 
x double> %s @@ -69,10 +71,11 @@ define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) { define <4 x float> @vfmerge_constant_v4f32(<4 x float> %x) { ; CHECK-LABEL: vfmerge_constant_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.i v0, 6 ; CHECK-NEXT: lui a0, 264704 -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.i v0, 6 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vrgather.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %s = shufflevector <4 x float> %x, <4 x float> , <4 x i32> ret <4 x float> %s @@ -85,8 +88,9 @@ define <4 x double> @vfmerge_constant_v4f64(<4 x double> %x) { ; CHECK-NEXT: fld fa5, %lo(.LCPI6_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vfmv.v.f v10, fa5 +; CHECK-NEXT: vrgather.vi v8, v10, 1, v0.t ; CHECK-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s @@ -96,11 +100,13 @@ define <8 x float> @vmerge_vxm(<8 x float> %v, float %s) { ; CHECK-LABEL: vmerge_vxm: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 25 -; CHECK-NEXT: vsetivli zero, 1, e32, m4, tu, ma -; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 8, e32, m1, tu, ma ; CHECK-NEXT: vfmv.s.f v8, fa0 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vmv2r.v v10, v8 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 ; CHECK-NEXT: ret %ins = insertelement <8 x float> %v, float %s, i32 0 %shuf = shufflevector <8 x float> %ins, <8 x float> poison, <8 x i32> @@ -110,10 +116,15 @@ define <8 x float> @vmerge_vxm(<8 x float> %v, float %s) { define <8 x float> @vmerge_vxm2(<8 x float> %v, float %s) { ; CHECK-LABEL: vmerge_vxm2: ; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e32, m4, tu, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vmv2r.v v10, v8 ; CHECK-NEXT: li a0, 25 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vfmv.s.f v12, fa0 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 +; CHECK-NEXT: vmv1r.v v10, v12 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v10, 0, v0.t ; CHECK-NEXT: ret %ins = insertelement <8 x float> %v, float %s, i32 0 %shuf = shufflevector <8 x float> %v, <8 x float> %ins, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll index 917613d5c786f..9676987016e0a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int-interleave.ll @@ -967,17 +967,19 @@ define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) { ; CHECK-LABEL: unary_interleave_10uu_v4i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 -; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.i v9, 8 +; CHECK-NEXT: vsrl.vv v10, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret ; ; ZIP-LABEL: unary_interleave_10uu_v4i8: ; ZIP: # %bb.0: ; ZIP-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZIP-NEXT: vsrl.vi v9, v8, 8 -; ZIP-NEXT: vsll.vi v8, v8, 8 -; ZIP-NEXT: vor.vv v8, 
v8, v9 +; ZIP-NEXT: vmv.v.i v9, 8 +; ZIP-NEXT: vsrl.vv v10, v8, v9 +; ZIP-NEXT: vsll.vv v8, v8, v9 +; ZIP-NEXT: vor.vv v8, v8, v10 ; ZIP-NEXT: ret %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> ret <4 x i8> %a diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll index 8676803e20e3b..73cc77585a9ba 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-int.ll @@ -30,7 +30,8 @@ define <4 x i16> @shuffle_xv_v4i16(<4 x i16> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 9 -; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> , <4 x i16> %x, <4 x i32> ret <4 x i16> %s @@ -41,7 +42,8 @@ define <4 x i16> @shuffle_vx_v4i16(<4 x i16> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> ret <4 x i16> %s @@ -209,14 +211,15 @@ define <8 x i64> @vrgather_shuffle_xv_v8i64(<8 x i64> %x) { ; ; RV64-LABEL: vrgather_shuffle_xv_v8i64: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vmv.v.i v12, -1 ; RV64-NEXT: li a0, 113 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.s.x v0, a0 ; RV64-NEXT: lui a0, 98305 ; RV64-NEXT: slli a0, a0, 6 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vmv.v.x v16, a0 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vmv.v.i v12, -1 ; RV64-NEXT: vrgatherei16.vv v12, v8, v16, v0.t ; RV64-NEXT: vmv.v.v v8, v12 ; RV64-NEXT: ret @@ -1325,7 +1328,8 @@ define <4 x i16> @vmerge_1(<4 x i16> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> ret <4 x i16> %s @@ -1346,9 +1350,10 @@ define <4 x i16> @vmerge_2(<4 x i16> %x) { define <4 x i16> @vmerge_3(<4 x i16> %x) { ; CHECK-LABEL: vmerge_3: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vmerge.vim v8, v8, 5, v0 +; CHECK-NEXT: vmv.v.i v9, 5 +; CHECK-NEXT: vrgather.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %s = shufflevector <4 x i16> %x, <4 x i16> , <4 x i32> ret <4 x i16> %s @@ -1448,11 +1453,13 @@ define <8 x i8> @vmerge_vxm(<8 x i8> %v, i8 %s) { ; CHECK-LABEL: vmerge_vxm: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 25 -; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, ma -; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, tu, ma ; CHECK-NEXT: vmv.s.x v8, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v9, v8, 0, v0.t +; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %ins = insertelement <8 x i8> %v, i8 %s, i32 0 %shuf = shufflevector <8 x i8> %ins, <8 x i8> poison, <8 x i32> @@ -1463,9 +1470,12 @@ define <8 x i8> @vmerge_vxm2(<8 x i8> %v, i8 %s) { ; CHECK-LABEL: vmerge_vxm2: ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 25 
-; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e8, m1, tu, ma ; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vmv.s.x v9, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vrgather.vi v8, v9, 0, v0.t ; CHECK-NEXT: ret %ins = insertelement <8 x i8> %v, i8 %s, i32 0 %shuf = shufflevector <8 x i8> %v, <8 x i8> %ins, <8 x i32> @@ -1478,7 +1488,8 @@ define <8 x i8> @vmerge_vxm3(<8 x i8> %v, i8 %s) { ; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %ins = insertelement <8 x i8> %v, i8 %s, i32 0 %splat = shufflevector <8 x i8> %ins, <8 x i8> poison, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index fe2072990e2ac..25828d1b5d29f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -10,9 +10,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 1 -; CHECK-NEXT: vsll.vi v9, v0, 7 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 7 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vsrl.vi v9, v0, 1 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_1: @@ -34,9 +35,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_2(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 2 -; CHECK-NEXT: vsll.vi v9, v0, 6 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 6 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vsrl.vi v9, v0, 2 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_2: @@ -58,9 +60,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_3(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_3: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 3 -; CHECK-NEXT: vsll.vi v9, v0, 5 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 5 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vsrl.vi v9, v0, 3 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_3: @@ -82,9 +85,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_4(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_4: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 4 -; CHECK-NEXT: vsll.vi v9, v0, 4 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 4 +; CHECK-NEXT: vsrl.vv v9, v0, v8 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_4: @@ -106,9 +110,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_5(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_5: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 5 -; CHECK-NEXT: vsll.vi v9, v0, 3 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vsrl.vi v9, v0, 5 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_5: @@ -130,9 +135,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_6(<8 x i1> %v) { ; 
CHECK-LABEL: shuffle_v8i1_as_i8_6: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 6 -; CHECK-NEXT: vsll.vi v9, v0, 2 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 2 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vsrl.vi v9, v0, 6 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_6: @@ -154,9 +160,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_7(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_7: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vsrl.vi v8, v0, 7 -; CHECK-NEXT: vadd.vv v9, v0, v0 -; CHECK-NEXT: vor.vv v0, v9, v8 +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsll.vv v8, v0, v8 +; CHECK-NEXT: vsrl.vi v9, v0, 7 +; CHECK-NEXT: vor.vv v0, v8, v9 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i1_as_i8_7: @@ -178,9 +185,10 @@ define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 -; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.i v9, 8 +; CHECK-NEXT: vsrl.vv v10, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i8_as_i16: @@ -201,10 +209,12 @@ define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i32_8: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 24 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 8 -; CHECK-NEXT: vsll.vi v8, v8, 24 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i8_as_i32_8: @@ -225,10 +235,12 @@ define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i32_16: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 16 -; CHECK-NEXT: vsll.vi v8, v8, 16 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i8_as_i32_16: @@ -250,9 +262,10 @@ define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i32_24: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 24 -; CHECK-NEXT: vsll.vi v8, v8, 8 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.i v9, 8 +; CHECK-NEXT: vsll.vv v9, v8, v9 +; CHECK-NEXT: vsrl.vi v8, v8, 24 +; CHECK-NEXT: vor.vv v8, v9, v8 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i8_as_i32_24: @@ -455,10 +468,12 @@ define <8 x i8> @shuffle_v8i8_as_i64_56(<8 x i8> %v) { define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { ; CHECK-LABEL: shuffle_v8i16_as_i32: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 16 -; CHECK-NEXT: vsll.vi v8, v8, 16 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i16_as_i32: @@ -479,25 +494,24 @@ define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { define <8 x i16> 
@shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; RV32-LABEL: shuffle_v8i16_as_i64_16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 48 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v9, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v9, v9, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_16: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 48 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vsll.vx v9, v8, a0 +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsll.vv v9, v8, v9 ; RV64-NEXT: vsrl.vi v8, v8, 16 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: ret @@ -524,27 +538,26 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; RV32-LABEL: shuffle_v8i16_as_i64_32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v9, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v9, v9, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_32: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsrl.vv v10, v8, v9 +; RV64-NEXT: vsll.vv v8, v8, v9 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i16_as_i64_32: @@ -569,18 +582,16 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; RV32-LABEL: shuffle_v8i16_as_i64_48: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 16 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v9, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v9, v9, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i16_as_i64_48: @@ -588,7 +599,9 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; RV64-NEXT: li a0, 48 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vsll.vi v8, v8, 16 +; 
RV64-NEXT: li a0, 16 +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsll.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: ret ; @@ -614,27 +627,26 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; RV32-LABEL: shuffle_v8i32_as_i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v12, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8i32_as_i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsrl.vv v12, v8, v10 +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8i32_as_i64: @@ -659,10 +671,12 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { ; CHECK-LABEL: shuffle_v8f16_as_i32: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 16 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vsrl.vi v9, v8, 16 -; CHECK-NEXT: vsll.vi v8, v8, 16 -; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsrl.vv v10, v8, v9 +; CHECK-NEXT: vsll.vv v8, v8, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8f16_as_i32: @@ -683,25 +697,24 @@ define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; RV32-LABEL: shuffle_v8f16_as_i64_16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 48 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v9, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v9, v9, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_16: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 48 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vsll.vx v9, v8, a0 +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsll.vv v9, v8, v9 ; RV64-NEXT: vsrl.vi v8, v8, 16 ; RV64-NEXT: vor.vv v8, v9, v8 ; RV64-NEXT: ret @@ -728,27 +741,26 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; RV32-LABEL: shuffle_v8f16_as_i64_32: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: li a1, 63 -; 
RV32-NEXT: vwsubu.vx v10, v9, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v9, v9, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_32: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: vmv.v.x v9, a0 +; RV64-NEXT: vsrl.vv v10, v8, v9 +; RV64-NEXT: vsll.vv v8, v8, v9 +; RV64-NEXT: vor.vv v8, v8, v10 ; RV64-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8f16_as_i64_32: @@ -773,18 +785,16 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; RV32-LABEL: shuffle_v8f16_as_i64_48: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vmv.v.i v9, 0 ; RV32-NEXT: li a0, 16 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v9, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v9, v9, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f16_as_i64_48: @@ -792,7 +802,9 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; RV64-NEXT: li a0, 48 ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vsrl.vx v9, v8, a0 -; RV64-NEXT: vsll.vi v8, v8, 16 +; RV64-NEXT: li a0, 16 +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsll.vv v8, v8, v10 ; RV64-NEXT: vor.vv v8, v8, v9 ; RV64-NEXT: ret ; @@ -818,27 +830,26 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; RV32-LABEL: shuffle_v8f32_as_i64: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v12, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f32_as_i64: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsrl.vv v12, v8, v10 +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: vor.vv 
v8, v8, v12 ; RV64-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8f32_as_i64: @@ -863,27 +874,26 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { define <8 x float> @shuffle_v8f32_as_i64_exact(<8 x float> %v) vscale_range(2,2) { ; RV32-LABEL: shuffle_v8f32_as_i64_exact: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmv.v.i v12, 0 ; RV32-NEXT: li a0, 32 -; RV32-NEXT: li a1, 63 -; RV32-NEXT: vwsubu.vx v10, v12, a0 -; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vmv.v.x v12, a0 -; RV32-NEXT: vand.vx v10, v10, a1 -; RV32-NEXT: vand.vx v12, v12, a1 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: shuffle_v8f32_as_i64_exact: ; RV64: # %bb.0: ; RV64-NEXT: li a0, 32 ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vsrl.vx v10, v8, a0 -; RV64-NEXT: vsll.vx v8, v8, a0 -; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsrl.vv v12, v8, v10 +; RV64-NEXT: vsll.vv v8, v8, v10 +; RV64-NEXT: vor.vv v8, v8, v12 ; RV64-NEXT: ret ; ; ZVKB-V-LABEL: shuffle_v8f32_as_i64_exact: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll index 74f2cec04f0de..804e2a14271a6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-store.ll @@ -132,10 +132,10 @@ define void @store_constant_v2i32(ptr %p) { ; CHECK-LABEL: store_constant_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vmv.v.i v9, 3 ; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: vmadd.vx v9, a1, v8 +; CHECK-NEXT: vmacc.vx v9, a1, v8 ; CHECK-NEXT: vse32.v v9, (a0) ; CHECK-NEXT: ret store <2 x i32> , ptr %p @@ -197,10 +197,10 @@ define void @store_constant_v2i8_align1(ptr %p) { ; CHECK-LABEL: store_constant_v2i8_align1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vmv.v.i v9, 3 ; CHECK-NEXT: li a1, 3 -; CHECK-NEXT: vmadd.vx v9, a1, v8 +; CHECK-NEXT: vmacc.vx v9, a1, v8 ; CHECK-NEXT: vse8.v v9, (a0) ; CHECK-NEXT: ret store <2 x i8> , ptr %p, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll index 07aa05f609c40..5de01f6db31a3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll @@ -62,10 +62,10 @@ define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture reado ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB1_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma ; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t ; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0 ; CHECK-NEXT: vadd.vv v9, v10, v9 ; CHECK-NEXT: vse8.v v9, (a0) ; CHECK-NEXT: addi a0, a0, 32 @@ -137,22 +137,58 @@ 
for.cond.cleanup: ; preds = %vector.body } define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { -; CHECK-LABEL: gather_zero_stride: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi a2, a0, 1024 -; CHECK-NEXT: li a3, 32 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma -; CHECK-NEXT: .LBB3_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lbu a3, 0(a1) -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a3 -; CHECK-NEXT: vse8.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 32 -; CHECK-NEXT: addi a1, a1, 160 -; CHECK-NEXT: bne a0, a2, .LBB3_1 -; CHECK-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-NEXT: ret +; V-LABEL: gather_zero_stride: +; V: # %bb.0: # %entry +; V-NEXT: addi a2, a0, 1024 +; V-NEXT: li a3, 32 +; V-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; V-NEXT: .LBB3_1: # %vector.body +; V-NEXT: # =>This Inner Loop Header: Depth=1 +; V-NEXT: lbu a3, 0(a1) +; V-NEXT: vle8.v v8, (a0) +; V-NEXT: vmv.v.x v9, a3 +; V-NEXT: vadd.vv v8, v8, v9 +; V-NEXT: vse8.v v8, (a0) +; V-NEXT: addi a0, a0, 32 +; V-NEXT: addi a1, a1, 160 +; V-NEXT: bne a0, a2, .LBB3_1 +; V-NEXT: # %bb.2: # %for.cond.cleanup +; V-NEXT: ret +; +; ZVE32F-LABEL: gather_zero_stride: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: addi a2, a0, 1024 +; ZVE32F-NEXT: li a3, 32 +; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; ZVE32F-NEXT: .LBB3_1: # %vector.body +; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: lbu a3, 0(a1) +; ZVE32F-NEXT: vle8.v v8, (a0) +; ZVE32F-NEXT: vmv.v.x v9, a3 +; ZVE32F-NEXT: vadd.vv v8, v8, v9 +; ZVE32F-NEXT: vse8.v v8, (a0) +; ZVE32F-NEXT: addi a0, a0, 32 +; ZVE32F-NEXT: addi a1, a1, 160 +; ZVE32F-NEXT: bne a0, a2, .LBB3_1 +; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup +; ZVE32F-NEXT: ret +; +; OPTIMIZED-LABEL: gather_zero_stride: +; OPTIMIZED: # %bb.0: # %entry +; OPTIMIZED-NEXT: addi a2, a0, 1024 +; OPTIMIZED-NEXT: li a3, 32 +; OPTIMIZED-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; OPTIMIZED-NEXT: .LBB3_1: # %vector.body +; OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1 +; OPTIMIZED-NEXT: vlse8.v v8, (a1), zero +; OPTIMIZED-NEXT: vle8.v v9, (a0) +; OPTIMIZED-NEXT: vadd.vv v8, v9, v8 +; OPTIMIZED-NEXT: vse8.v v8, (a0) +; OPTIMIZED-NEXT: addi a0, a0, 32 +; OPTIMIZED-NEXT: addi a1, a1, 160 +; OPTIMIZED-NEXT: bne a0, a2, .LBB3_1 +; OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup +; OPTIMIZED-NEXT: ret entry: br label %vector.body @@ -176,21 +212,55 @@ for.cond.cleanup: ; preds = %vector.body } define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { -; CHECK-LABEL: gather_zero_stride_i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi a2, a0, 1024 -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma -; CHECK-NEXT: .LBB4_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a3, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vadd.vx v8, v8, a3 -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 8 -; CHECK-NEXT: addi a1, a1, 160 -; CHECK-NEXT: bne a0, a2, .LBB4_1 -; CHECK-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-NEXT: ret +; V-LABEL: gather_zero_stride_i32: +; V: # %bb.0: # %entry +; V-NEXT: addi a2, a0, 1024 +; V-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V-NEXT: .LBB4_1: # %vector.body +; V-NEXT: # =>This Inner Loop Header: Depth=1 +; V-NEXT: lw a3, 0(a1) +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vmv.v.x v9, a3 +; V-NEXT: vadd.vv v8, v8, v9 +; V-NEXT: vse32.v v8, (a0) +; V-NEXT: addi a0, a0, 8 +; V-NEXT: addi a1, a1, 160 +; V-NEXT: bne a0, 
a2, .LBB4_1 +; V-NEXT: # %bb.2: # %for.cond.cleanup +; V-NEXT: ret +; +; ZVE32F-LABEL: gather_zero_stride_i32: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: addi a2, a0, 1024 +; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; ZVE32F-NEXT: .LBB4_1: # %vector.body +; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: lw a3, 0(a1) +; ZVE32F-NEXT: vle32.v v8, (a0) +; ZVE32F-NEXT: vmv.v.x v9, a3 +; ZVE32F-NEXT: vadd.vv v8, v8, v9 +; ZVE32F-NEXT: vse32.v v8, (a0) +; ZVE32F-NEXT: addi a0, a0, 8 +; ZVE32F-NEXT: addi a1, a1, 160 +; ZVE32F-NEXT: bne a0, a2, .LBB4_1 +; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup +; ZVE32F-NEXT: ret +; +; OPTIMIZED-LABEL: gather_zero_stride_i32: +; OPTIMIZED: # %bb.0: # %entry +; OPTIMIZED-NEXT: addi a2, a0, 1024 +; OPTIMIZED-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; OPTIMIZED-NEXT: .LBB4_1: # %vector.body +; OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1 +; OPTIMIZED-NEXT: vlse32.v v8, (a1), zero +; OPTIMIZED-NEXT: vle32.v v9, (a0) +; OPTIMIZED-NEXT: vadd.vv v8, v9, v8 +; OPTIMIZED-NEXT: vse32.v v8, (a0) +; OPTIMIZED-NEXT: addi a0, a0, 8 +; OPTIMIZED-NEXT: addi a1, a1, 160 +; OPTIMIZED-NEXT: bne a0, a2, .LBB4_1 +; OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup +; OPTIMIZED-NEXT: ret entry: br label %vector.body @@ -344,12 +414,12 @@ define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture read ; CHECK-NEXT: li a4, 5 ; CHECK-NEXT: .LBB7_1: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu -; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t +; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, ma +; CHECK-NEXT: vlse8.v v9, (a0), a4, v0.t +; CHECK-NEXT: vle8.v v10, (a1) ; CHECK-NEXT: addi a1, a1, 32 -; CHECK-NEXT: vadd.vv v9, v10, v9 +; CHECK-NEXT: vmerge.vvm v9, v8, v9, v0 +; CHECK-NEXT: vadd.vv v9, v9, v10 ; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t ; CHECK-NEXT: addi a0, a0, 160 ; CHECK-NEXT: bne a1, a2, .LBB7_1 @@ -1064,22 +1134,58 @@ bb16: ; preds = %bb4, %bb } define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { -; CHECK-LABEL: gather_zero_stride_fp: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a2, 1 -; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vsetivli zero, 8, e32, m1, ta, ma -; CHECK-NEXT: .LBB16_1: # %vector.body -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: flw fa5, 0(a1) -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfadd.vf v8, v8, fa5 -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: addi a1, a1, 640 -; CHECK-NEXT: bne a0, a2, .LBB16_1 -; CHECK-NEXT: # %bb.2: # %for.cond.cleanup -; CHECK-NEXT: ret +; V-LABEL: gather_zero_stride_fp: +; V: # %bb.0: # %entry +; V-NEXT: lui a2, 1 +; V-NEXT: add a2, a0, a2 +; V-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; V-NEXT: .LBB16_1: # %vector.body +; V-NEXT: # =>This Inner Loop Header: Depth=1 +; V-NEXT: flw fa5, 0(a1) +; V-NEXT: vle32.v v8, (a0) +; V-NEXT: vfmv.v.f v9, fa5 +; V-NEXT: vfadd.vv v8, v8, v9 +; V-NEXT: vse32.v v8, (a0) +; V-NEXT: addi a0, a0, 128 +; V-NEXT: addi a1, a1, 640 +; V-NEXT: bne a0, a2, .LBB16_1 +; V-NEXT: # %bb.2: # %for.cond.cleanup +; V-NEXT: ret +; +; ZVE32F-LABEL: gather_zero_stride_fp: +; ZVE32F: # %bb.0: # %entry +; ZVE32F-NEXT: lui a2, 1 +; ZVE32F-NEXT: add a2, a0, a2 +; ZVE32F-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; ZVE32F-NEXT: .LBB16_1: # %vector.body +; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: flw fa5, 0(a1) +; ZVE32F-NEXT: 
vle32.v v8, (a0) +; ZVE32F-NEXT: vfmv.v.f v9, fa5 +; ZVE32F-NEXT: vfadd.vv v8, v8, v9 +; ZVE32F-NEXT: vse32.v v8, (a0) +; ZVE32F-NEXT: addi a0, a0, 128 +; ZVE32F-NEXT: addi a1, a1, 640 +; ZVE32F-NEXT: bne a0, a2, .LBB16_1 +; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup +; ZVE32F-NEXT: ret +; +; OPTIMIZED-LABEL: gather_zero_stride_fp: +; OPTIMIZED: # %bb.0: # %entry +; OPTIMIZED-NEXT: lui a2, 1 +; OPTIMIZED-NEXT: add a2, a0, a2 +; OPTIMIZED-NEXT: vsetivli zero, 8, e32, m1, ta, ma +; OPTIMIZED-NEXT: .LBB16_1: # %vector.body +; OPTIMIZED-NEXT: # =>This Inner Loop Header: Depth=1 +; OPTIMIZED-NEXT: vlse32.v v8, (a1), zero +; OPTIMIZED-NEXT: vle32.v v9, (a0) +; OPTIMIZED-NEXT: vfadd.vv v8, v9, v8 +; OPTIMIZED-NEXT: vse32.v v8, (a0) +; OPTIMIZED-NEXT: addi a0, a0, 128 +; OPTIMIZED-NEXT: addi a1, a1, 640 +; OPTIMIZED-NEXT: bne a0, a2, .LBB16_1 +; OPTIMIZED-NEXT: # %bb.2: # %for.cond.cleanup +; OPTIMIZED-NEXT: ret entry: br label %vector.body diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll index 4b7f82f94f5e4..6bfafc02654bd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll @@ -755,12 +755,12 @@ define <4 x i64> @zero_strided_vadd.vx(<4 x i64> %v, ptr %ptr) { ; CHECK-RV32-NEXT: vadd.vv v8, v8, v10 ; CHECK-RV32-NEXT: ret ; -; CHECK-RV64-LABEL: zero_strided_vadd.vx: -; CHECK-RV64: # %bb.0: -; CHECK-RV64-NEXT: ld a0, 0(a0) -; CHECK-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-RV64-NEXT: vadd.vx v8, v8, a0 -; CHECK-RV64-NEXT: ret +; CHECK-OPT-LABEL: zero_strided_vadd.vx: +; CHECK-OPT: # %bb.0: +; CHECK-OPT-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-OPT-NEXT: vlse64.v v10, (a0), zero +; CHECK-OPT-NEXT: vadd.vv v8, v8, v10 +; CHECK-OPT-NEXT: ret %load = call <4 x i64> @llvm.experimental.vp.strided.load.v4i64.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4) %w = add <4 x i64> %v, %load ret <4 x i64> %w diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll index 3b1dc298c12ce..6453a0dadc103 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-sat-clip.ll @@ -104,8 +104,10 @@ define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i16>, ptr %x, align 16 @@ -121,9 +123,11 @@ define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vmax.vx v8, v8, zero ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i16>, ptr %x, align 16 @@ -230,8 +234,11 @@ define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) { ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, 
ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i32>, ptr %x, align 16 @@ -247,10 +254,13 @@ define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: li a0, 50 ; CHECK-NEXT: vmax.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i32>, ptr %x, align 16 @@ -358,9 +368,12 @@ define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret %1 = load <4 x i64>, ptr %x, align 16 @@ -376,9 +389,12 @@ define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vmax.vx v8, v8, zero ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vse32.v v10, (a1) ; CHECK-NEXT: ret %1 = load <4 x i64>, ptr %x, align 16 @@ -448,10 +464,12 @@ define void @trunc_sat_u8u32_maxmin(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i32>, ptr %x, align 16 @@ -467,11 +485,13 @@ define void @trunc_sat_u8u32_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vmax.vx v8, v8, zero ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i32>, ptr %x, align 16 @@ -547,12 +567,14 @@ define void @trunc_sat_u8u64_maxmin(ptr %x, ptr %y) { ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 
x i64>, ptr %x, align 16 @@ -568,13 +590,15 @@ define void @trunc_sat_u8u64_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vmax.vx v8, v8, zero ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i64>, ptr %x, align 16 @@ -666,10 +690,13 @@ define void @trunc_sat_u16u64_maxmin(ptr %x, ptr %y) { ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: li a0, 1 ; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i64>, ptr %x, align 16 @@ -685,12 +712,15 @@ define void @trunc_sat_u16u64_minmax(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: li a0, 50 ; CHECK-NEXT: vmax.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: vse16.v v8, (a1) ; CHECK-NEXT: ret %1 = load <4 x i64>, ptr %x, align 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll index dc432efbd5c47..34504a3cbe986 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vaaddu.ll @@ -68,9 +68,13 @@ define <8 x i8> @vaaddu_vv_v8i8_floor_zexti32(<8 x i8> %x, <8 x i8> %y) { define <8 x i8> @vaaddu_vv_v8i8_floor_lshr2(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: vaaddu_vv_v8i8_floor_lshr2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: vnsrl.wi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 +; CHECK-NEXT: vsrl.vi v8, v8, 2 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %xzv = zext <8 x i8> %x to <8 x i16> %yzv = zext <8 x i8> %y to <8 x i16> @@ -281,12 +285,14 @@ define <8 x i8> @vaaddu_vv_v8i8_ceil_zexti32(<8 x i8> %x, <8 x i8> %y) { define <8 x i8> @vaaddu_vv_v8i8_ceil_lshr2(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: vaaddu_vv_v8i8_ceil_lshr2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vadd.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 +; CHECK-NEXT: vadd.vi v8, v8, 2 +; 
CHECK-NEXT: vsrl.vi v8, v8, 2 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 2 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %xzv = zext <8 x i8> %x to <8 x i16> %yzv = zext <8 x i8> %y to <8 x i16> @@ -300,12 +306,13 @@ define <8 x i8> @vaaddu_vv_v8i8_ceil_lshr2(<8 x i8> %x, <8 x i8> %y) { define <8 x i8> @vaaddu_vv_v8i8_ceil_add2(<8 x i8> %x, <8 x i8> %y) { ; CHECK-LABEL: vaaddu_vv_v8i8_ceil_add2: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: csrwi vxrm, 2 -; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-NEXT: vaaddu.vx v8, v10, a0 +; CHECK-NEXT: vaaddu.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vandn.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vandn.ll index ae7f8ed78aa06..3a2a21f6b572e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vandn.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vandn.ll @@ -17,7 +17,8 @@ define <8 x i8> @not_signbit_mask_v8i8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-ZVKB: # %bb.0: ; CHECK-ZVKB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-ZVKB-NEXT: vsra.vi v8, v8, 7 -; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8 +; CHECK-ZVKB-NEXT: vnot.v v8, v8 +; CHECK-ZVKB-NEXT: vand.vv v8, v8, v9 ; CHECK-ZVKB-NEXT: ret %cond = icmp sgt <8 x i8> %a, splat (i8 -1) %r = select <8 x i1> %cond, <8 x i8> %b, <8 x i8> zeroinitializer @@ -37,7 +38,8 @@ define <4 x i16> @not_signbit_mask_v4i16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-ZVKB: # %bb.0: ; CHECK-ZVKB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-ZVKB-NEXT: vsra.vi v8, v8, 15 -; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8 +; CHECK-ZVKB-NEXT: vnot.v v8, v8 +; CHECK-ZVKB-NEXT: vand.vv v8, v8, v9 ; CHECK-ZVKB-NEXT: ret %cond = icmp sgt <4 x i16> %a, splat (i16 -1) %r = select <4 x i1> %cond, <4 x i16> %b, <4 x i16> zeroinitializer @@ -57,7 +59,8 @@ define <2 x i32> @not_signbit_mask_v2i32(<2 x i32> %a, <2 x i32> %b) { ; CHECK-ZVKB: # %bb.0: ; CHECK-ZVKB-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-ZVKB-NEXT: vsra.vi v8, v8, 31 -; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8 +; CHECK-ZVKB-NEXT: vnot.v v8, v8 +; CHECK-ZVKB-NEXT: vand.vv v8, v8, v9 ; CHECK-ZVKB-NEXT: ret %cond = icmp sgt <2 x i32> %a, splat (i32 -1) %r = select <2 x i1> %cond, <2 x i32> %b, <2 x i32> zeroinitializer @@ -78,7 +81,8 @@ define <2 x i64> @not_signbit_mask_v2i64(<2 x i64> %a, <2 x i64> %b) { ; CHECK-ZVKB-NEXT: li a0, 63 ; CHECK-ZVKB-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-ZVKB-NEXT: vsra.vx v8, v8, a0 -; CHECK-ZVKB-NEXT: vandn.vv v8, v9, v8 +; CHECK-ZVKB-NEXT: vnot.v v8, v8 +; CHECK-ZVKB-NEXT: vand.vv v8, v8, v9 ; CHECK-ZVKB-NEXT: ret %cond = icmp sgt <2 x i64> %a, splat (i64 -1) %r = select <2 x i1> %cond, <2 x i64> %b, <2 x i64> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll index dfd509062ccf7..74d3bc36ad74f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmp-constrained-sdnode.ll @@ -325,7 +325,8 @@ define <1 x i1> @fcmp_ueq_vv_v1f16(<1 x half> %va, <1 x half> %vb) nounwind stri ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: 
vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f16(<1 x half> %va, <1 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -342,7 +343,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -361,7 +363,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -958,7 +961,8 @@ define <2 x i1> @fcmp_ueq_vv_v2f16(<2 x half> %va, <2 x half> %vb) nounwind stri ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f16(<2 x half> %va, <2 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -975,7 +979,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -994,7 +999,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1591,7 +1597,8 @@ define <4 x i1> @fcmp_ueq_vv_v4f16(<4 x half> %va, <4 x half> %vb) nounwind stri ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f16(<4 x half> %va, <4 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -1608,7 +1615,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret 
%head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1627,7 +1635,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -2224,7 +2233,8 @@ define <8 x i1> @fcmp_ueq_vv_v8f16(<8 x half> %va, <8 x half> %vb) nounwind stri ; CHECK-NEXT: vmv.v.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -2241,7 +2251,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2260,7 +2271,8 @@ define <8 x i1> @fcmp_ueq_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2885,7 +2897,8 @@ define <16 x i1> @fcmp_ueq_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmflt.vv v13, v8, v10, v0.t ; CHECK-NEXT: vmflt.vv v12, v10, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -2903,7 +2916,8 @@ define <16 x i1> @fcmp_ueq_vf_v16f16(<16 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v10, v11 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2923,7 +2937,8 @@ define <16 x i1> @fcmp_ueq_fv_v16f16(<16 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v10, v11 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -3544,9 +3559,9 @@ 
define <32 x i1> @fcmp_ord_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfeq.vf v16, v12, fa0 -; CHECK-NEXT: vmfeq.vv v12, v8, v8 -; CHECK-NEXT: vmand.mm v0, v12, v16 +; CHECK-NEXT: vmfeq.vv v16, v8, v8 +; CHECK-NEXT: vmfeq.vf v8, v12, fa0 +; CHECK-NEXT: vmand.mm v0, v16, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3560,9 +3575,9 @@ define <32 x i1> @fcmp_ord_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfeq.vf v16, v12, fa0 -; CHECK-NEXT: vmfeq.vv v12, v8, v8 -; CHECK-NEXT: vmand.mm v0, v16, v12 +; CHECK-NEXT: vmfeq.vv v16, v8, v8 +; CHECK-NEXT: vmfeq.vf v8, v12, fa0 +; CHECK-NEXT: vmand.mm v0, v8, v16 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3582,7 +3597,8 @@ define <32 x i1> @fcmp_ueq_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmflt.vv v17, v8, v12, v0.t ; CHECK-NEXT: vmflt.vv v16, v12, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v16, v17 +; CHECK-NEXT: vmor.mm v8, v16, v17 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <32 x i1> @llvm.experimental.constrained.fcmp.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <32 x i1> %1 @@ -3601,7 +3617,8 @@ define <32 x i1> @fcmp_ueq_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3622,7 +3639,8 @@ define <32 x i1> @fcmp_ueq_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3902,9 +3920,9 @@ define <32 x i1> @fcmp_uno_vf_v32f16(<32 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfne.vf v16, v12, fa0 -; CHECK-NEXT: vmfne.vv v12, v8, v8 -; CHECK-NEXT: vmor.mm v0, v12, v16 +; CHECK-NEXT: vmfne.vv v16, v8, v8 +; CHECK-NEXT: vmfne.vf v8, v12, fa0 +; CHECK-NEXT: vmor.mm v0, v16, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3918,9 +3936,9 @@ define <32 x i1> @fcmp_uno_fv_v32f16(<32 x half> %va, half %b) nounwind strictfp ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfne.vf v16, v12, fa0 -; CHECK-NEXT: vmfne.vv v12, v8, v8 -; CHECK-NEXT: vmor.mm v0, v16, v12 +; CHECK-NEXT: 
vmfne.vv v16, v8, v8 +; CHECK-NEXT: vmfne.vf v8, v12, fa0 +; CHECK-NEXT: vmor.mm v0, v8, v16 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -4249,7 +4267,8 @@ define <1 x i1> @fcmp_ueq_vv_v1f32(<1 x float> %va, <1 x float> %vb) nounwind st ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f32(<1 x float> %va, <1 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -4266,7 +4285,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -4285,7 +4305,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -4882,7 +4903,8 @@ define <2 x i1> @fcmp_ueq_vv_v2f32(<2 x float> %va, <2 x float> %vb) nounwind st ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float> %va, <2 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -4899,7 +4921,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4918,7 +4941,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -5515,7 +5539,8 @@ define <4 x i1> @fcmp_ueq_vv_v4f32(<4 x float> %va, <4 x float> %vb) nounwind st ; CHECK-NEXT: vmv.v.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> 
@llvm.experimental.constrained.fcmp.v4f32(<4 x float> %va, <4 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -5532,7 +5557,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -5551,7 +5577,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -6176,7 +6203,8 @@ define <8 x i1> @fcmp_ueq_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind st ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmflt.vv v13, v8, v10, v0.t ; CHECK-NEXT: vmflt.vv v12, v10, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -6194,7 +6222,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v10, v11 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -6214,7 +6243,8 @@ define <8 x i1> @fcmp_ueq_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v10, v11 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -6851,7 +6881,8 @@ define <16 x i1> @fcmp_ueq_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwin ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmflt.vv v17, v8, v12, v0.t ; CHECK-NEXT: vmflt.vv v16, v12, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v16, v17 +; CHECK-NEXT: vmor.mm v8, v16, v17 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmp.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -6869,7 +6900,8 @@ define <16 x i1> @fcmp_ueq_vf_v16f32(<16 x float> %va, float %b) nounwind strict ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> 
zeroinitializer @@ -6889,7 +6921,8 @@ define <16 x i1> @fcmp_ueq_fv_v16f32(<16 x float> %va, float %b) nounwind strict ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -7498,7 +7531,8 @@ define <1 x i1> @fcmp_ueq_vv_v1f64(<1 x double> %va, <1 x double> %vb) nounwind ; CHECK-NEXT: vmv.v.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmp.v1f64(<1 x double> %va, <1 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -7515,7 +7549,8 @@ define <1 x i1> @fcmp_ueq_vf_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -7534,7 +7569,8 @@ define <1 x i1> @fcmp_ueq_fv_v1f64(<1 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -8131,7 +8167,8 @@ define <2 x i1> @fcmp_ueq_vv_v2f64(<2 x double> %va, <2 x double> %vb) nounwind ; CHECK-NEXT: vmv.v.v v10, v0 ; CHECK-NEXT: vmflt.vv v10, v8, v9, v0.t ; CHECK-NEXT: vmflt.vv v0, v9, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v10 +; CHECK-NEXT: vmor.mm v8, v0, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double> %va, <2 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -8148,7 +8185,8 @@ define <2 x i1> @fcmp_ueq_vf_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmflt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -8167,7 +8205,8 @@ define <2 x i1> @fcmp_ueq_fv_v2f64(<2 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv.v.v v9, v0 ; CHECK-NEXT: vmfgt.vf v9, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v0, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v0, v9 +; CHECK-NEXT: vmor.mm v8, v0, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -8792,7 +8831,8 @@ define <4 x i1> @fcmp_ueq_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind ; CHECK-NEXT: vmv1r.v 
v0, v12 ; CHECK-NEXT: vmflt.vv v13, v8, v10, v0.t ; CHECK-NEXT: vmflt.vv v12, v10, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmp.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -8810,7 +8850,8 @@ define <4 x i1> @fcmp_ueq_vf_v4f64(<4 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmflt.vf v11, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v10, v11 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -8830,7 +8871,8 @@ define <4 x i1> @fcmp_ueq_fv_v4f64(<4 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v10, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v10, v11 +; CHECK-NEXT: vmor.mm v8, v10, v11 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -9467,7 +9509,8 @@ define <8 x i1> @fcmp_ueq_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmflt.vv v17, v8, v12, v0.t ; CHECK-NEXT: vmflt.vv v16, v12, v8, v0.t -; CHECK-NEXT: vmnor.mm v0, v16, v17 +; CHECK-NEXT: vmor.mm v8, v16, v17 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmp.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -9485,7 +9528,8 @@ define <8 x i1> @fcmp_ueq_vf_v8f64(<8 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmflt.vf v13, v8, fa0, v0.t ; CHECK-NEXT: vmfgt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer @@ -9505,7 +9549,8 @@ define <8 x i1> @fcmp_ueq_fv_v8f64(<8 x double> %va, double %b) nounwind strictf ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmfgt.vf v13, v8, fa0, v0.t ; CHECK-NEXT: vmflt.vf v12, v8, fa0, v0.t -; CHECK-NEXT: vmnor.mm v0, v12, v13 +; CHECK-NEXT: vmor.mm v8, v12, v13 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll index 472f2073667db..ed08eca292d82 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfcmps-constrained-sdnode.ll @@ -269,7 +269,8 @@ define <1 x i1> @fcmps_ueq_vv_v1f16(<1 x half> %va, <1 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> 
@llvm.experimental.constrained.fcmps.v1f16(<1 x half> %va, <1 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -281,7 +282,8 @@ define <1 x i1> @fcmps_ueq_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -295,7 +297,8 @@ define <1 x i1> @fcmps_ueq_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -457,7 +460,8 @@ define <1 x i1> @fcmps_une_vv_v1f16(<1 x half> %va, <1 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f16(<1 x half> %va, <1 x half> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -469,7 +473,8 @@ define <1 x i1> @fcmps_une_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -483,7 +488,8 @@ define <1 x i1> @fcmps_une_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -497,8 +503,9 @@ define <1 x i1> @fcmps_uno_vv_v1f16(<1 x half> %va, <1 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f16(<1 x half> %va, <1 x half> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -511,8 +518,9 @@ define <1 x i1> @fcmps_uno_vf_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> 
poison, <1 x i32> zeroinitializer @@ -525,10 +533,11 @@ define <1 x i1> @fcmps_uno_fv_v1f16(<1 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %b, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -801,7 +810,8 @@ define <2 x i1> @fcmps_ueq_vv_v2f16(<2 x half> %va, <2 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f16(<2 x half> %va, <2 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -813,7 +823,8 @@ define <2 x i1> @fcmps_ueq_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -827,7 +838,8 @@ define <2 x i1> @fcmps_ueq_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -989,7 +1001,8 @@ define <2 x i1> @fcmps_une_vv_v2f16(<2 x half> %va, <2 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f16(<2 x half> %va, <2 x half> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -1001,7 +1014,8 @@ define <2 x i1> @fcmps_une_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1015,7 +1029,8 @@ define <2 x i1> @fcmps_une_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> 
zeroinitializer @@ -1029,8 +1044,9 @@ define <2 x i1> @fcmps_uno_vv_v2f16(<2 x half> %va, <2 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f16(<2 x half> %va, <2 x half> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -1043,8 +1059,9 @@ define <2 x i1> @fcmps_uno_vf_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1057,10 +1074,11 @@ define <2 x i1> @fcmps_uno_fv_v2f16(<2 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %b, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1333,7 +1351,8 @@ define <4 x i1> @fcmps_ueq_vv_v4f16(<4 x half> %va, <4 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f16(<4 x half> %va, <4 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -1345,7 +1364,8 @@ define <4 x i1> @fcmps_ueq_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1359,7 +1379,8 @@ define <4 x i1> @fcmps_ueq_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1521,7 +1542,8 @@ define <4 x i1> @fcmps_une_vv_v4f16(<4 x half> %va, <4 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f16(<4 x half> %va, <4 x half> %vb, metadata !"une", 
metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -1533,7 +1555,8 @@ define <4 x i1> @fcmps_une_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1547,7 +1570,8 @@ define <4 x i1> @fcmps_une_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1561,8 +1585,9 @@ define <4 x i1> @fcmps_uno_vv_v4f16(<4 x half> %va, <4 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f16(<4 x half> %va, <4 x half> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -1575,8 +1600,9 @@ define <4 x i1> @fcmps_uno_vf_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1589,10 +1615,11 @@ define <4 x i1> @fcmps_uno_fv_v4f16(<4 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %b, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1865,7 +1892,8 @@ define <8 x i1> @fcmps_ueq_vv_v8f16(<8 x half> %va, <8 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -1877,7 +1905,8 @@ define <8 x i1> @fcmps_ueq_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = 
shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1891,7 +1920,8 @@ define <8 x i1> @fcmps_ueq_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2053,7 +2083,8 @@ define <8 x i1> @fcmps_une_vv_v8f16(<8 x half> %va, <8 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -2065,7 +2096,8 @@ define <8 x i1> @fcmps_une_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2079,7 +2111,8 @@ define <8 x i1> @fcmps_une_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2093,8 +2126,9 @@ define <8 x i1> @fcmps_uno_vv_v8f16(<8 x half> %va, <8 x half> %vb) nounwind str ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f16(<8 x half> %va, <8 x half> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -2107,8 +2141,9 @@ define <8 x i1> @fcmps_uno_vf_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2121,10 +2156,11 @@ define <8 x i1> @fcmps_uno_fv_v8f16(<8 x half> %va, half %b) nounwind strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %b, i32 
0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2397,7 +2433,8 @@ define <16 x i1> @fcmps_ueq_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmflt.vv v12, v8, v10 ; CHECK-NEXT: vmflt.vv v13, v10, v8 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -2409,7 +2446,8 @@ define <16 x i1> @fcmps_ueq_vf_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmflt.vf v10, v8, fa0 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v11, v10 +; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2423,7 +2461,8 @@ define <16 x i1> @fcmps_ueq_fv_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmfgt.vf v10, v8, fa0 ; CHECK-NEXT: vmflt.vf v11, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v11, v10 +; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2585,7 +2624,8 @@ define <16 x i1> @fcmps_une_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmfle.vv v12, v10, v8 ; CHECK-NEXT: vmfle.vv v13, v8, v10 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -2597,7 +2637,8 @@ define <16 x i1> @fcmps_une_vf_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmfge.vf v10, v8, fa0 ; CHECK-NEXT: vmfle.vf v11, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v11, v10 +; CHECK-NEXT: vmand.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2611,7 +2652,8 @@ define <16 x i1> @fcmps_une_fv_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmfle.vf v10, v8, fa0 ; CHECK-NEXT: vmfge.vf v11, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v11, v10 +; CHECK-NEXT: vmand.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2625,8 +2667,9 @@ define <16 x i1> @fcmps_uno_vv_v16f16(<16 x half> %va, <16 x half> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vmfle.vv v12, v10, v10 ; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmnot.m v8, v12 +; CHECK-NEXT: vmnot.m v9, v10 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> 
@llvm.experimental.constrained.fcmps.v16f16(<16 x half> %va, <16 x half> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -2639,8 +2682,9 @@ define <16 x i1> @fcmps_uno_vf_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vmfle.vv v12, v8, v8 ; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmnot.m v9, v12 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2653,10 +2697,11 @@ define <16 x i1> @fcmps_uno_fv_v16f16(<16 x half> %va, half %b) nounwind strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmnot.m v10, v12 -; CHECK-NEXT: vmfle.vv v11, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v10, v11 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %b, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -2918,9 +2963,9 @@ define <32 x i1> @fcmps_ord_vf_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmand.mm v0, v12, v16 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmand.mm v0, v16, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -2934,9 +2979,9 @@ define <32 x i1> @fcmps_ord_fv_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmand.mm v0, v16, v12 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmand.mm v0, v8, v16 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -2951,7 +2996,8 @@ define <32 x i1> @fcmps_ueq_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmflt.vv v16, v8, v12 ; CHECK-NEXT: vmflt.vv v17, v12, v8 -; CHECK-NEXT: vmnor.mm v0, v17, v16 +; CHECK-NEXT: vmor.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <32 x i1> %1 @@ -2964,7 +3010,8 @@ define <32 x i1> @fcmps_ueq_vf_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmflt.vf v12, v8, fa0 ; CHECK-NEXT: vmfgt.vf v13, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -2979,7 +3026,8 @@ define <32 x i1> 
@fcmps_ueq_fv_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmfgt.vf v12, v8, fa0 ; CHECK-NEXT: vmflt.vf v13, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3154,7 +3202,8 @@ define <32 x i1> @fcmps_une_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmfle.vv v16, v12, v8 ; CHECK-NEXT: vmfle.vv v17, v8, v12 -; CHECK-NEXT: vmnand.mm v0, v17, v16 +; CHECK-NEXT: vmand.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <32 x i1> %1 @@ -3167,7 +3216,8 @@ define <32 x i1> @fcmps_une_vf_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmfge.vf v12, v8, fa0 ; CHECK-NEXT: vmfle.vf v13, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3182,7 +3232,8 @@ define <32 x i1> @fcmps_une_fv_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmfle.vf v12, v8, fa0 ; CHECK-NEXT: vmfge.vf v13, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3197,8 +3248,9 @@ define <32 x i1> @fcmps_uno_vv_v32f16(<32 x half> %va, <32 x half> %vb) nounwind ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vmfle.vv v16, v12, v12 ; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmnot.m v8, v16 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %1 = call <32 x i1> @llvm.experimental.constrained.fcmps.v32f16(<32 x half> %va, <32 x half> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <32 x i1> %1 @@ -3211,9 +3263,10 @@ define <32 x i1> @fcmps_uno_vf_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vmfle.vv v16, v8, v8 -; CHECK-NEXT: vmfle.vf v8, v12, fa0 -; CHECK-NEXT: vmnot.m v9, v16 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v16 +; CHECK-NEXT: vmfle.vf v9, v12, fa0 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3228,9 +3281,10 @@ define <32 x i1> @fcmps_uno_fv_v32f16(<32 x half> %va, half %b) nounwind strictf ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vmfle.vv v16, v8, v8 -; CHECK-NEXT: vmfle.vf v8, v12, fa0 -; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmnot.m v8, v16 +; CHECK-NEXT: vmfle.vf v9, v12, fa0 +; CHECK-NEXT: 
vmnot.m v9, v9 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %b, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -3503,7 +3557,8 @@ define <1 x i1> @fcmps_ueq_vv_v1f32(<1 x float> %va, <1 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f32(<1 x float> %va, <1 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -3515,7 +3570,8 @@ define <1 x i1> @fcmps_ueq_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -3529,7 +3585,8 @@ define <1 x i1> @fcmps_ueq_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -3691,7 +3748,8 @@ define <1 x i1> @fcmps_une_vv_v1f32(<1 x float> %va, <1 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f32(<1 x float> %va, <1 x float> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -3703,7 +3761,8 @@ define <1 x i1> @fcmps_une_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -3717,7 +3776,8 @@ define <1 x i1> @fcmps_une_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -3731,8 +3791,9 @@ define <1 x i1> @fcmps_uno_vv_v1f32(<1 x float> %va, <1 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <1 x 
i1> @llvm.experimental.constrained.fcmps.v1f32(<1 x float> %va, <1 x float> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -3745,8 +3806,9 @@ define <1 x i1> @fcmps_uno_vf_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -3759,10 +3821,11 @@ define <1 x i1> @fcmps_uno_fv_v1f32(<1 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %b, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -4035,7 +4098,8 @@ define <2 x i1> @fcmps_ueq_vv_v2f32(<2 x float> %va, <2 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %va, <2 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -4047,7 +4111,8 @@ define <2 x i1> @fcmps_ueq_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4061,7 +4126,8 @@ define <2 x i1> @fcmps_ueq_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4223,7 +4289,8 @@ define <2 x i1> @fcmps_une_vv_v2f32(<2 x float> %va, <2 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %va, <2 x float> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -4235,7 +4302,8 @@ define <2 x i1> @fcmps_une_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, 
v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4249,7 +4317,8 @@ define <2 x i1> @fcmps_une_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4263,8 +4332,9 @@ define <2 x i1> @fcmps_uno_vv_v2f32(<2 x float> %va, <2 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float> %va, <2 x float> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -4277,8 +4347,9 @@ define <2 x i1> @fcmps_uno_vf_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4291,10 +4362,11 @@ define <2 x i1> @fcmps_uno_fv_v2f32(<2 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %b, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -4567,7 +4639,8 @@ define <4 x i1> @fcmps_ueq_vv_v4f32(<4 x float> %va, <4 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %va, <4 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -4579,7 +4652,8 @@ define <4 x i1> @fcmps_ueq_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -4593,7 +4667,8 @@ define <4 x i1> @fcmps_ueq_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, 
v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -4755,7 +4830,8 @@ define <4 x i1> @fcmps_une_vv_v4f32(<4 x float> %va, <4 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %va, <4 x float> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -4767,7 +4843,8 @@ define <4 x i1> @fcmps_une_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -4781,7 +4858,8 @@ define <4 x i1> @fcmps_une_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -4795,8 +4873,9 @@ define <4 x i1> @fcmps_uno_vv_v4f32(<4 x float> %va, <4 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float> %va, <4 x float> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -4809,8 +4888,9 @@ define <4 x i1> @fcmps_uno_vf_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -4823,10 +4903,11 @@ define <4 x i1> @fcmps_uno_fv_v4f32(<4 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %b, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -5099,7 +5180,8 @@ define <8 x i1> @fcmps_ueq_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmflt.vv v12, v8, v10 ; CHECK-NEXT: vmflt.vv v13, v10, v8 -; CHECK-NEXT: vmnor.mm 
v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -5111,7 +5193,8 @@ define <8 x i1> @fcmps_ueq_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmflt.vf v10, v8, fa0 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v11, v10 +; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5125,7 +5208,8 @@ define <8 x i1> @fcmps_ueq_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfgt.vf v10, v8, fa0 ; CHECK-NEXT: vmflt.vf v11, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v11, v10 +; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5287,7 +5371,8 @@ define <8 x i1> @fcmps_une_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfle.vv v12, v10, v8 ; CHECK-NEXT: vmfle.vv v13, v8, v10 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -5299,7 +5384,8 @@ define <8 x i1> @fcmps_une_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfge.vf v10, v8, fa0 ; CHECK-NEXT: vmfle.vf v11, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v11, v10 +; CHECK-NEXT: vmand.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5313,7 +5399,8 @@ define <8 x i1> @fcmps_une_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfle.vf v10, v8, fa0 ; CHECK-NEXT: vmfge.vf v11, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v11, v10 +; CHECK-NEXT: vmand.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5327,8 +5414,9 @@ define <8 x i1> @fcmps_uno_vv_v8f32(<8 x float> %va, <8 x float> %vb) nounwind s ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfle.vv v12, v10, v10 ; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmnot.m v8, v12 +; CHECK-NEXT: vmnot.m v9, v10 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f32(<8 x float> %va, <8 x float> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -5341,8 +5429,9 @@ define <8 x i1> @fcmps_uno_vf_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vmfle.vv v12, v8, v8 ; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v8, v8 ; 
CHECK-NEXT: vmnot.m v9, v12 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5355,10 +5444,11 @@ define <8 x i1> @fcmps_uno_fv_v8f32(<8 x float> %va, float %b) nounwind strictfp ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmnot.m v10, v12 -; CHECK-NEXT: vmfle.vv v11, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v10, v11 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %b, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -5631,7 +5721,8 @@ define <16 x i1> @fcmps_ueq_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwi ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vv v16, v8, v12 ; CHECK-NEXT: vmflt.vv v17, v12, v8 -; CHECK-NEXT: vmnor.mm v0, v17, v16 +; CHECK-NEXT: vmor.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -5643,7 +5734,8 @@ define <16 x i1> @fcmps_ueq_vf_v16f32(<16 x float> %va, float %b) nounwind stric ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmflt.vf v12, v8, fa0 ; CHECK-NEXT: vmfgt.vf v13, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -5657,7 +5749,8 @@ define <16 x i1> @fcmps_ueq_fv_v16f32(<16 x float> %va, float %b) nounwind stric ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfgt.vf v12, v8, fa0 ; CHECK-NEXT: vmflt.vf v13, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -5819,7 +5912,8 @@ define <16 x i1> @fcmps_une_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwi ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v16, v12, v8 ; CHECK-NEXT: vmfle.vv v17, v8, v12 -; CHECK-NEXT: vmnand.mm v0, v17, v16 +; CHECK-NEXT: vmand.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -5831,7 +5925,8 @@ define <16 x i1> @fcmps_une_vf_v16f32(<16 x float> %va, float %b) nounwind stric ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfge.vf v12, v8, fa0 ; CHECK-NEXT: vmfle.vf v13, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -5845,7 +5940,8 @@ define <16 x i1> @fcmps_une_fv_v16f32(<16 x float> %va, float %b) 
nounwind stric ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vf v12, v8, fa0 ; CHECK-NEXT: vmfge.vf v13, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -5859,8 +5955,9 @@ define <16 x i1> @fcmps_uno_vv_v16f32(<16 x float> %va, <16 x float> %vb) nounwi ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfle.vv v16, v12, v12 ; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmnot.m v8, v16 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %1 = call <16 x i1> @llvm.experimental.constrained.fcmps.v16f32(<16 x float> %va, <16 x float> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <16 x i1> %1 @@ -5873,8 +5970,9 @@ define <16 x i1> @fcmps_uno_vf_v16f32(<16 x float> %va, float %b) nounwind stric ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vmfle.vv v16, v8, v8 ; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmnot.m v9, v16 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -5887,10 +5985,11 @@ define <16 x i1> @fcmps_uno_fv_v16f32(<16 x float> %va, float %b) nounwind stric ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmnot.m v12, v16 -; CHECK-NEXT: vmfle.vv v13, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v12, v13 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %b, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -6163,7 +6262,8 @@ define <1 x i1> @fcmps_ueq_vv_v1f64(<1 x double> %va, <1 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %va, <1 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -6175,7 +6275,8 @@ define <1 x i1> @fcmps_ueq_vf_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -6189,7 +6290,8 @@ define <1 x i1> @fcmps_ueq_fv_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, 
double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -6351,7 +6453,8 @@ define <1 x i1> @fcmps_une_vv_v1f64(<1 x double> %va, <1 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %va, <1 x double> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -6363,7 +6466,8 @@ define <1 x i1> @fcmps_une_vf_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -6377,7 +6481,8 @@ define <1 x i1> @fcmps_une_fv_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -6391,8 +6496,9 @@ define <1 x i1> @fcmps_uno_vv_v1f64(<1 x double> %va, <1 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <1 x i1> @llvm.experimental.constrained.fcmps.v1f64(<1 x double> %va, <1 x double> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <1 x i1> %1 @@ -6405,8 +6511,9 @@ define <1 x i1> @fcmps_uno_vf_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vfmv.s.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -6419,10 +6526,11 @@ define <1 x i1> @fcmps_uno_fv_v1f64(<1 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <1 x double> poison, double %b, i32 0 %splat = shufflevector <1 x double> %head, <1 x double> poison, <1 x i32> zeroinitializer @@ -6695,7 +6803,8 @@ define <2 x i1> @fcmps_ueq_vv_v2f64(<2 x double> %va, <2 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmflt.vv v10, v8, v9 ; CHECK-NEXT: vmflt.vv v8, v9, v8 -; CHECK-NEXT: vmnor.mm v0, v8, v10 +; CHECK-NEXT: vmor.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 
; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %va, <2 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -6707,7 +6816,8 @@ define <2 x i1> @fcmps_ueq_vf_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmflt.vf v9, v8, fa0 ; CHECK-NEXT: vmfgt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -6721,7 +6831,8 @@ define <2 x i1> @fcmps_ueq_fv_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfgt.vf v9, v8, fa0 ; CHECK-NEXT: vmflt.vf v8, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -6883,7 +6994,8 @@ define <2 x i1> @fcmps_une_vv_v2f64(<2 x double> %va, <2 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfle.vv v10, v9, v8 ; CHECK-NEXT: vmfle.vv v8, v8, v9 -; CHECK-NEXT: vmnand.mm v0, v8, v10 +; CHECK-NEXT: vmand.mm v8, v8, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %va, <2 x double> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -6895,7 +7007,8 @@ define <2 x i1> @fcmps_une_vf_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfge.vf v9, v8, fa0 ; CHECK-NEXT: vmfle.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -6909,7 +7022,8 @@ define <2 x i1> @fcmps_une_fv_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfle.vf v9, v8, fa0 ; CHECK-NEXT: vmfge.vf v8, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v8, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -6923,8 +7037,9 @@ define <2 x i1> @fcmps_uno_vv_v2f64(<2 x double> %va, <2 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfle.vv v9, v9, v9 ; CHECK-NEXT: vmfle.vv v8, v8, v8 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %1 = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double> %va, <2 x double> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <2 x i1> %1 @@ -6937,8 +7052,9 @@ define <2 x i1> @fcmps_uno_vf_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vfmv.v.f v9, fa0 ; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 +; CHECK-NEXT: vmnot.m v9, v9 ; CHECK-NEXT: vmnot.m v8, v8 -; CHECK-NEXT: vmorn.mm v0, v8, v9 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = 
insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -6951,10 +7067,11 @@ define <2 x i1> @fcmps_uno_fv_v2f64(<2 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vmfle.vv v8, v8, v8 ; CHECK-NEXT: vmfle.vf v9, v9, fa0 ; CHECK-NEXT: vmnot.m v9, v9 -; CHECK-NEXT: vmfle.vv v8, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %b, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -7227,7 +7344,8 @@ define <4 x i1> @fcmps_ueq_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmflt.vv v12, v8, v10 ; CHECK-NEXT: vmflt.vv v13, v10, v8 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -7239,7 +7357,8 @@ define <4 x i1> @fcmps_ueq_vf_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmflt.vf v10, v8, fa0 ; CHECK-NEXT: vmfgt.vf v11, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v11, v10 +; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7253,7 +7372,8 @@ define <4 x i1> @fcmps_ueq_fv_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfgt.vf v10, v8, fa0 ; CHECK-NEXT: vmflt.vf v11, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v11, v10 +; CHECK-NEXT: vmor.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7415,7 +7535,8 @@ define <4 x i1> @fcmps_une_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfle.vv v12, v10, v8 ; CHECK-NEXT: vmfle.vv v13, v8, v10 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -7427,7 +7548,8 @@ define <4 x i1> @fcmps_une_vf_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfge.vf v10, v8, fa0 ; CHECK-NEXT: vmfle.vf v11, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v11, v10 +; CHECK-NEXT: vmand.mm v8, v11, v10 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7441,7 +7563,8 @@ define <4 x i1> @fcmps_une_fv_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfle.vf v10, v8, fa0 ; CHECK-NEXT: vmfge.vf v11, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v11, v10 +; CHECK-NEXT: vmand.mm v8, v11, v10 
+; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7455,8 +7578,9 @@ define <4 x i1> @fcmps_uno_vv_v4f64(<4 x double> %va, <4 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfle.vv v12, v10, v10 ; CHECK-NEXT: vmfle.vv v10, v8, v8 -; CHECK-NEXT: vmnot.m v8, v10 -; CHECK-NEXT: vmorn.mm v0, v8, v12 +; CHECK-NEXT: vmnot.m v8, v12 +; CHECK-NEXT: vmnot.m v9, v10 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %1 = call <4 x i1> @llvm.experimental.constrained.fcmps.v4f64(<4 x double> %va, <4 x double> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <4 x i1> %1 @@ -7469,8 +7593,9 @@ define <4 x i1> @fcmps_uno_vf_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vmfle.vv v12, v8, v8 ; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmnot.m v9, v12 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7483,10 +7608,11 @@ define <4 x i1> @fcmps_uno_fv_v4f64(<4 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 -; CHECK-NEXT: vmfle.vf v12, v10, fa0 -; CHECK-NEXT: vmnot.m v10, v12 -; CHECK-NEXT: vmfle.vv v11, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v10, v11 +; CHECK-NEXT: vmfle.vv v12, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v10, fa0 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %b, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -7759,7 +7885,8 @@ define <8 x i1> @fcmps_ueq_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmflt.vv v16, v8, v12 ; CHECK-NEXT: vmflt.vv v17, v12, v8 -; CHECK-NEXT: vmnor.mm v0, v17, v16 +; CHECK-NEXT: vmor.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"ueq", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -7771,7 +7898,8 @@ define <8 x i1> @fcmps_ueq_vf_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmflt.vf v12, v8, fa0 ; CHECK-NEXT: vmfgt.vf v13, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer @@ -7785,7 +7913,8 @@ define <8 x i1> @fcmps_ueq_fv_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfgt.vf v12, v8, fa0 ; CHECK-NEXT: vmflt.vf v13, v8, fa0 -; CHECK-NEXT: vmnor.mm v0, v13, v12 +; CHECK-NEXT: vmor.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer @@ -7947,7 +8076,8 @@ define <8 x i1> @fcmps_une_vv_v8f64(<8 x double> %va, <8 x double> %vb) 
nounwind ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfle.vv v16, v12, v8 ; CHECK-NEXT: vmfle.vv v17, v8, v12 -; CHECK-NEXT: vmnand.mm v0, v17, v16 +; CHECK-NEXT: vmand.mm v8, v17, v16 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"une", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -7959,7 +8089,8 @@ define <8 x i1> @fcmps_une_vf_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfge.vf v12, v8, fa0 ; CHECK-NEXT: vmfle.vf v13, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer @@ -7973,7 +8104,8 @@ define <8 x i1> @fcmps_une_fv_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfle.vf v12, v8, fa0 ; CHECK-NEXT: vmfge.vf v13, v8, fa0 -; CHECK-NEXT: vmnand.mm v0, v13, v12 +; CHECK-NEXT: vmand.mm v8, v13, v12 +; CHECK-NEXT: vmnot.m v0, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer @@ -7987,8 +8119,9 @@ define <8 x i1> @fcmps_uno_vv_v8f64(<8 x double> %va, <8 x double> %vb) nounwind ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfle.vv v16, v12, v12 ; CHECK-NEXT: vmfle.vv v12, v8, v8 -; CHECK-NEXT: vmnot.m v8, v12 -; CHECK-NEXT: vmorn.mm v0, v8, v16 +; CHECK-NEXT: vmnot.m v8, v16 +; CHECK-NEXT: vmnot.m v9, v12 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %1 = call <8 x i1> @llvm.experimental.constrained.fcmps.v8f64(<8 x double> %va, <8 x double> %vb, metadata !"uno", metadata !"fpexcept.strict") strictfp ret <8 x i1> %1 @@ -8001,8 +8134,9 @@ define <8 x i1> @fcmps_uno_vf_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK-NEXT: vfmv.v.f v12, fa0 ; CHECK-NEXT: vmfle.vv v16, v8, v8 ; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vmnot.m v9, v16 -; CHECK-NEXT: vmorn.mm v0, v9, v8 +; CHECK-NEXT: vmor.mm v0, v9, v8 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer @@ -8015,10 +8149,11 @@ define <8 x i1> @fcmps_uno_fv_v8f64(<8 x double> %va, double %b) nounwind strict ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vfmv.v.f v12, fa0 -; CHECK-NEXT: vmfle.vf v16, v12, fa0 -; CHECK-NEXT: vmnot.m v12, v16 -; CHECK-NEXT: vmfle.vv v13, v8, v8 -; CHECK-NEXT: vmorn.mm v0, v12, v13 +; CHECK-NEXT: vmfle.vv v16, v8, v8 +; CHECK-NEXT: vmfle.vf v8, v12, fa0 +; CHECK-NEXT: vmnot.m v8, v8 +; CHECK-NEXT: vmnot.m v9, v16 +; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %b, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll index bc13e1d217a9b..38d002e117f1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmacc-vp.ll @@ -12,8 +12,10 @@ declare <2 x half> @llvm.vp.select.v2f16(<2 x i1>, <2 x half>, <2 x half>, 
i32) define <2 x half> @vfmacc_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -24,8 +26,10 @@ define <2 x half> @vfmacc_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, define <2 x half> @vfmacc_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -36,8 +40,10 @@ define <2 x half> @vfmacc_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x h define <2 x half> @vfmacc_vf_v2f16(<2 x half> %va, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -50,8 +56,10 @@ define <2 x half> @vfmacc_vf_v2f16(<2 x half> %va, half %b, <2 x half> %c, <2 x define <2 x half> @vfmacc_vf_v2f16_commute(<2 x half> %va, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -64,8 +72,10 @@ define <2 x half> @vfmacc_vf_v2f16_commute(<2 x half> %va, half %b, <2 x half> % define <2 x half> @vfmacc_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -78,9 +88,9 @@ define <2 x half> @vfmacc_vf_v2f16_unmasked(<2 x half> %va, half %b, <2 x half> define <2 x half> @vfmacc_vv_v2f16_ta(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; 
CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) %u = call <2 x half> @llvm.vp.select.v2f16(<2 x i1> %m, <2 x half> %v, <2 x half> %c, i32 %evl) @@ -90,9 +100,9 @@ define <2 x half> @vfmacc_vv_v2f16_ta(<2 x half> %a, <2 x half> %b, <2 x half> % define <2 x half> @vfmacc_vf_v2f16_ta(<2 x half> %va, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -104,9 +114,9 @@ define <2 x half> @vfmacc_vf_v2f16_ta(<2 x half> %va, half %b, <2 x half> %c, <2 define <2 x half> @vfmacc_vf_v2f16_commute_ta(<2 x half> %va, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -123,8 +133,10 @@ declare <4 x half> @llvm.vp.select.v4f16(<4 x i1>, <4 x half>, <4 x half>, i32) define <4 x half> @vfmacc_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -135,8 +147,10 @@ define <4 x half> @vfmacc_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, define <4 x half> @vfmacc_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -147,8 +161,10 @@ define <4 x half> @vfmacc_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x h define <4 x half> @vfmacc_vf_v4f16(<4 x half> %va, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: 
vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -161,8 +177,10 @@ define <4 x half> @vfmacc_vf_v4f16(<4 x half> %va, half %b, <4 x half> %c, <4 x define <4 x half> @vfmacc_vf_v4f16_commute(<4 x half> %va, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -175,8 +193,10 @@ define <4 x half> @vfmacc_vf_v4f16_commute(<4 x half> %va, half %b, <4 x half> % define <4 x half> @vfmacc_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -189,9 +209,9 @@ define <4 x half> @vfmacc_vf_v4f16_unmasked(<4 x half> %va, half %b, <4 x half> define <4 x half> @vfmacc_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) %u = call <4 x half> @llvm.vp.select.v4f16(<4 x i1> %m, <4 x half> %v, <4 x half> %c, i32 %evl) @@ -201,9 +221,9 @@ define <4 x half> @vfmacc_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> % define <4 x half> @vfmacc_vf_v4f16_ta(<4 x half> %va, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -215,9 +235,9 @@ define <4 x half> @vfmacc_vf_v4f16_ta(<4 x half> %va, half %b, <4 x half> %c, <4 define <4 x half> @vfmacc_vf_v4f16_commute_ta(<4 x half> %va, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, 
v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -234,8 +254,10 @@ declare <8 x half> @llvm.vp.select.v8f16(<8 x i1>, <8 x half>, <8 x half>, i32) define <8 x half> @vfmacc_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -246,8 +268,10 @@ define <8 x half> @vfmacc_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, define <8 x half> @vfmacc_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -258,8 +282,10 @@ define <8 x half> @vfmacc_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x h define <8 x half> @vfmacc_vf_v8f16(<8 x half> %va, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -272,8 +298,10 @@ define <8 x half> @vfmacc_vf_v8f16(<8 x half> %va, half %b, <8 x half> %c, <8 x define <8 x half> @vfmacc_vf_v8f16_commute(<8 x half> %va, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -286,8 +314,10 @@ define <8 x half> @vfmacc_vf_v8f16_commute(<8 x half> %va, half %b, <8 x half> % define <8 x half> @vfmacc_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -300,9 +330,9 @@ define <8 x half> 
@vfmacc_vf_v8f16_unmasked(<8 x half> %va, half %b, <8 x half> define <8 x half> @vfmacc_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) %u = call <8 x half> @llvm.vp.select.v8f16(<8 x i1> %m, <8 x half> %v, <8 x half> %c, i32 %evl) @@ -312,9 +342,9 @@ define <8 x half> @vfmacc_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> % define <8 x half> @vfmacc_vf_v8f16_ta(<8 x half> %va, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -326,9 +356,9 @@ define <8 x half> @vfmacc_vf_v8f16_ta(<8 x half> %va, half %b, <8 x half> %c, <8 define <8 x half> @vfmacc_vf_v8f16_commute_ta(<8 x half> %va, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -345,8 +375,10 @@ declare <16 x half> @llvm.vp.select.v16f16(<16 x i1>, <16 x half>, <16 x half>, define <16 x half> @vfmacc_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -357,8 +389,10 @@ define <16 x half> @vfmacc_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> define <16 x half> @vfmacc_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -369,8 +403,10 @@ define 
<16 x half> @vfmacc_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, <1 define <16 x half> @vfmacc_vf_v16f16(<16 x half> %va, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -383,8 +419,10 @@ define <16 x half> @vfmacc_vf_v16f16(<16 x half> %va, half %b, <16 x half> %c, < define <16 x half> @vfmacc_vf_v16f16_commute(<16 x half> %va, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -397,8 +435,10 @@ define <16 x half> @vfmacc_vf_v16f16_commute(<16 x half> %va, half %b, <16 x hal define <16 x half> @vfmacc_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -411,9 +451,9 @@ define <16 x half> @vfmacc_vf_v16f16_unmasked(<16 x half> %va, half %b, <16 x ha define <16 x half> @vfmacc_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) %u = call <16 x half> @llvm.vp.select.v16f16(<16 x i1> %m, <16 x half> %v, <16 x half> %c, i32 %evl) @@ -423,9 +463,9 @@ define <16 x half> @vfmacc_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x ha define <16 x half> @vfmacc_vf_v16f16_ta(<16 x half> %va, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -437,9 +477,9 @@ define <16 x half> @vfmacc_vf_v16f16_ta(<16 x half> %va, half %b, <16 x half> %c define <16 x half> @vfmacc_vf_v16f16_commute_ta(<16 x half> %va, 
half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -456,8 +496,10 @@ declare <32 x half> @llvm.vp.select.v32f16(<32 x i1>, <32 x half>, <32 x half>, define <32 x half> @vfmacc_vv_v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %v = call <32 x half> @llvm.vp.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -468,8 +510,10 @@ define <32 x half> @vfmacc_vv_v32f16(<32 x half> %a, <32 x half> %b, <32 x half> define <32 x half> @vfmacc_vv_v32f16_unmasked(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v32f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %v = call <32 x half> @llvm.vp.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -480,8 +524,10 @@ define <32 x half> @vfmacc_vv_v32f16_unmasked(<32 x half> %a, <32 x half> %b, <3 define <32 x half> @vfmacc_vf_v32f16(<32 x half> %va, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -494,8 +540,10 @@ define <32 x half> @vfmacc_vf_v32f16(<32 x half> %va, half %b, <32 x half> %c, < define <32 x half> @vfmacc_vf_v32f16_commute(<32 x half> %va, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v32f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -508,8 +556,10 @@ define <32 x half> @vfmacc_vf_v32f16_commute(<32 x half> %va, half %b, <32 x hal define <32 x half> @vfmacc_vf_v32f16_unmasked(<32 x half> %va, half %b, <32 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v32f16_unmasked: ; CHECK: 
# %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -522,9 +572,9 @@ define <32 x half> @vfmacc_vf_v32f16_unmasked(<32 x half> %va, half %b, <32 x ha define <32 x half> @vfmacc_vv_v32f16_ta(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v32f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %v = call <32 x half> @llvm.vp.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) %u = call <32 x half> @llvm.vp.select.v32f16(<32 x i1> %m, <32 x half> %v, <32 x half> %c, i32 %evl) @@ -534,9 +584,9 @@ define <32 x half> @vfmacc_vv_v32f16_ta(<32 x half> %a, <32 x half> %b, <32 x ha define <32 x half> @vfmacc_vf_v32f16_ta(<32 x half> %va, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v32f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -548,9 +598,9 @@ define <32 x half> @vfmacc_vf_v32f16_ta(<32 x half> %va, half %b, <32 x half> %c define <32 x half> @vfmacc_vf_v32f16_commute_ta(<32 x half> %va, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v32f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -567,8 +617,10 @@ declare <2 x float> @llvm.vp.select.v2f32(<2 x i1>, <2 x float>, <2 x float>, i3 define <2 x float> @vfmacc_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -579,8 +631,10 @@ define <2 x float> @vfmacc_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> define <2 x float> @vfmacc_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f32_unmasked: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -591,8 +645,10 @@ define <2 x float> @vfmacc_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 define <2 x float> @vfmacc_vf_v2f32(<2 x float> %va, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -605,8 +661,10 @@ define <2 x float> @vfmacc_vf_v2f32(<2 x float> %va, float %b, <2 x float> %c, < define <2 x float> @vfmacc_vf_v2f32_commute(<2 x float> %va, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -619,8 +677,10 @@ define <2 x float> @vfmacc_vf_v2f32_commute(<2 x float> %va, float %b, <2 x floa define <2 x float> @vfmacc_vf_v2f32_unmasked(<2 x float> %va, float %b, <2 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -633,9 +693,9 @@ define <2 x float> @vfmacc_vf_v2f32_unmasked(<2 x float> %va, float %b, <2 x flo define <2 x float> @vfmacc_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %v = call <2 x float> @llvm.vp.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) %u = call <2 x float> @llvm.vp.select.v2f32(<2 x i1> %m, <2 x float> %v, <2 x float> %c, i32 %evl) @@ -645,9 +705,9 @@ define <2 x float> @vfmacc_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x floa define <2 x float> @vfmacc_vf_v2f32_ta(<2 x float> %va, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, 
a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -659,9 +719,9 @@ define <2 x float> @vfmacc_vf_v2f32_ta(<2 x float> %va, float %b, <2 x float> %c define <2 x float> @vfmacc_vf_v2f32_commute_ta(<2 x float> %va, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -678,8 +738,10 @@ declare <4 x float> @llvm.vp.select.v4f32(<4 x i1>, <4 x float>, <4 x float>, i3 define <4 x float> @vfmacc_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -690,8 +752,10 @@ define <4 x float> @vfmacc_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> define <4 x float> @vfmacc_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -702,8 +766,10 @@ define <4 x float> @vfmacc_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 define <4 x float> @vfmacc_vf_v4f32(<4 x float> %va, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -716,8 +782,10 @@ define <4 x float> @vfmacc_vf_v4f32(<4 x float> %va, float %b, <4 x float> %c, < define <4 x float> @vfmacc_vf_v4f32_commute(<4 x float> %va, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; 
CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -730,8 +798,10 @@ define <4 x float> @vfmacc_vf_v4f32_commute(<4 x float> %va, float %b, <4 x floa define <4 x float> @vfmacc_vf_v4f32_unmasked(<4 x float> %va, float %b, <4 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -744,9 +814,9 @@ define <4 x float> @vfmacc_vf_v4f32_unmasked(<4 x float> %va, float %b, <4 x flo define <4 x float> @vfmacc_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %v = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) %u = call <4 x float> @llvm.vp.select.v4f32(<4 x i1> %m, <4 x float> %v, <4 x float> %c, i32 %evl) @@ -756,9 +826,9 @@ define <4 x float> @vfmacc_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x floa define <4 x float> @vfmacc_vf_v4f32_ta(<4 x float> %va, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -770,9 +840,9 @@ define <4 x float> @vfmacc_vf_v4f32_ta(<4 x float> %va, float %b, <4 x float> %c define <4 x float> @vfmacc_vf_v4f32_commute_ta(<4 x float> %va, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -789,8 +859,10 @@ declare <8 x float> @llvm.vp.select.v8f32(<8 x i1>, <8 x float>, <8 x float>, i3 define <8 x float> @vfmacc_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %v = call <8 x float> 
@llvm.vp.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -801,8 +873,10 @@ define <8 x float> @vfmacc_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> define <8 x float> @vfmacc_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -813,8 +887,10 @@ define <8 x float> @vfmacc_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 define <8 x float> @vfmacc_vf_v8f32(<8 x float> %va, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -827,8 +903,10 @@ define <8 x float> @vfmacc_vf_v8f32(<8 x float> %va, float %b, <8 x float> %c, < define <8 x float> @vfmacc_vf_v8f32_commute(<8 x float> %va, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -841,8 +919,10 @@ define <8 x float> @vfmacc_vf_v8f32_commute(<8 x float> %va, float %b, <8 x floa define <8 x float> @vfmacc_vf_v8f32_unmasked(<8 x float> %va, float %b, <8 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -855,9 +935,9 @@ define <8 x float> @vfmacc_vf_v8f32_unmasked(<8 x float> %va, float %b, <8 x flo define <8 x float> @vfmacc_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %v = call <8 x float> @llvm.vp.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) %u = call <8 x float> @llvm.vp.select.v8f32(<8 x i1> %m, <8 x float> %v, <8 x float> %c, i32 %evl) @@ -867,9 +947,9 @@ define 
<8 x float> @vfmacc_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x floa define <8 x float> @vfmacc_vf_v8f32_ta(<8 x float> %va, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -881,9 +961,9 @@ define <8 x float> @vfmacc_vf_v8f32_ta(<8 x float> %va, float %b, <8 x float> %c define <8 x float> @vfmacc_vf_v8f32_commute_ta(<8 x float> %va, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -900,8 +980,10 @@ declare <16 x float> @llvm.vp.select.v16f32(<16 x i1>, <16 x float>, <16 x float define <16 x float> @vfmacc_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -912,8 +994,10 @@ define <16 x float> @vfmacc_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @vfmacc_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -924,8 +1008,10 @@ define <16 x float> @vfmacc_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b, define <16 x float> @vfmacc_vf_v16f32(<16 x float> %va, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -938,8 +1024,10 @@ define <16 x float> @vfmacc_vf_v16f32(<16 x float> 
%va, float %b, <16 x float> % define <16 x float> @vfmacc_vf_v16f32_commute(<16 x float> %va, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -952,8 +1040,10 @@ define <16 x float> @vfmacc_vf_v16f32_commute(<16 x float> %va, float %b, <16 x define <16 x float> @vfmacc_vf_v16f32_unmasked(<16 x float> %va, float %b, <16 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -966,9 +1056,9 @@ define <16 x float> @vfmacc_vf_v16f32_unmasked(<16 x float> %va, float %b, <16 x define <16 x float> @vfmacc_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %v = call <16 x float> @llvm.vp.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) %u = call <16 x float> @llvm.vp.select.v16f32(<16 x i1> %m, <16 x float> %v, <16 x float> %c, i32 %evl) @@ -978,9 +1068,9 @@ define <16 x float> @vfmacc_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 x define <16 x float> @vfmacc_vf_v16f32_ta(<16 x float> %va, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -992,9 +1082,9 @@ define <16 x float> @vfmacc_vf_v16f32_ta(<16 x float> %va, float %b, <16 x float define <16 x float> @vfmacc_vf_v16f32_commute_ta(<16 x float> %va, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v16f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1011,8 +1101,10 @@ declare <2 x double> @llvm.vp.select.v2f64(<2 x i1>, <2 x double>, <2 x double>, 
define <2 x double> @vfmacc_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1023,8 +1115,10 @@ define <2 x double> @vfmacc_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x doub define <2 x double> @vfmacc_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1035,8 +1129,10 @@ define <2 x double> @vfmacc_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, define <2 x double> @vfmacc_vf_v2f64(<2 x double> %va, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1049,8 +1145,10 @@ define <2 x double> @vfmacc_vf_v2f64(<2 x double> %va, double %b, <2 x double> % define <2 x double> @vfmacc_vf_v2f64_commute(<2 x double> %va, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1063,8 +1161,10 @@ define <2 x double> @vfmacc_vf_v2f64_commute(<2 x double> %va, double %b, <2 x d define <2 x double> @vfmacc_vf_v2f64_unmasked(<2 x double> %va, double %b, <2 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1077,9 +1177,9 @@ define <2 x double> @vfmacc_vf_v2f64_unmasked(<2 x double> %va, double %b, <2 x define <2 x double> @vfmacc_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vfmacc_vv_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %v = call <2 x double> @llvm.vp.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) %u = call <2 x double> @llvm.vp.select.v2f64(<2 x i1> %m, <2 x double> %v, <2 x double> %c, i32 %evl) @@ -1089,9 +1189,9 @@ define <2 x double> @vfmacc_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x d define <2 x double> @vfmacc_vf_v2f64_ta(<2 x double> %va, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1103,9 +1203,9 @@ define <2 x double> @vfmacc_vf_v2f64_ta(<2 x double> %va, double %b, <2 x double define <2 x double> @vfmacc_vf_v2f64_commute_ta(<2 x double> %va, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v2f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1122,8 +1222,10 @@ declare <4 x double> @llvm.vp.select.v4f64(<4 x i1>, <4 x double>, <4 x double>, define <4 x double> @vfmacc_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1134,8 +1236,10 @@ define <4 x double> @vfmacc_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x doub define <4 x double> @vfmacc_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1146,8 +1250,10 @@ define <4 x double> @vfmacc_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, define <4 x double> @vfmacc_vf_v4f64(<4 x double> %va, 
double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1160,8 +1266,10 @@ define <4 x double> @vfmacc_vf_v4f64(<4 x double> %va, double %b, <4 x double> % define <4 x double> @vfmacc_vf_v4f64_commute(<4 x double> %va, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1174,8 +1282,10 @@ define <4 x double> @vfmacc_vf_v4f64_commute(<4 x double> %va, double %b, <4 x d define <4 x double> @vfmacc_vf_v4f64_unmasked(<4 x double> %va, double %b, <4 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1188,9 +1298,9 @@ define <4 x double> @vfmacc_vf_v4f64_unmasked(<4 x double> %va, double %b, <4 x define <4 x double> @vfmacc_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %v = call <4 x double> @llvm.vp.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) %u = call <4 x double> @llvm.vp.select.v4f64(<4 x i1> %m, <4 x double> %v, <4 x double> %c, i32 %evl) @@ -1200,9 +1310,9 @@ define <4 x double> @vfmacc_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x d define <4 x double> @vfmacc_vf_v4f64_ta(<4 x double> %va, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1214,9 +1324,9 @@ define <4 x double> @vfmacc_vf_v4f64_ta(<4 x double> %va, double %b, <4 x double define <4 x double> @vfmacc_vf_v4f64_commute_ta(<4 x double> %va, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vfmacc_vf_v4f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1233,8 +1343,10 @@ declare <8 x double> @llvm.vp.select.v8f64(<8 x i1>, <8 x double>, <8 x double>, define <8 x double> @vfmacc_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1245,8 +1357,10 @@ define <8 x double> @vfmacc_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x doub define <8 x double> @vfmacc_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1257,8 +1371,10 @@ define <8 x double> @vfmacc_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, define <8 x double> @vfmacc_vf_v8f64(<8 x double> %va, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1271,8 +1387,10 @@ define <8 x double> @vfmacc_vf_v8f64(<8 x double> %va, double %b, <8 x double> % define <8 x double> @vfmacc_vf_v8f64_commute(<8 x double> %va, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1285,8 +1403,10 @@ define <8 x double> @vfmacc_vf_v8f64_commute(<8 x double> %va, double %b, <8 x d define <8 x double> @vfmacc_vf_v8f64_unmasked(<8 x double> %va, double %b, <8 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, 
e64, m4, tu, ma -; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1299,9 +1419,9 @@ define <8 x double> @vfmacc_vf_v8f64_unmasked(<8 x double> %va, double %b, <8 x define <8 x double> @vfmacc_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vv_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %v = call <8 x double> @llvm.vp.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) %u = call <8 x double> @llvm.vp.select.v8f64(<8 x i1> %m, <8 x double> %v, <8 x double> %c, i32 %evl) @@ -1311,9 +1431,9 @@ define <8 x double> @vfmacc_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x d define <8 x double> @vfmacc_vf_v8f64_ta(<8 x double> %va, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer @@ -1325,9 +1445,9 @@ define <8 x double> @vfmacc_vf_v8f64_ta(<8 x double> %va, double %b, <8 x double define <8 x double> @vfmacc_vf_v8f64_commute_ta(<8 x double> %va, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmacc_vf_v8f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll index 99fc035235671..02e603de5838a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsac-vp.ll @@ -12,8 +12,11 @@ declare <2 x half> @llvm.vp.select.v2f16(<2 x i1>, <2 x half>, <2 x half>, i32) define <2 x half> @vfmsac_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -25,8 +28,11 @@ 
define <2 x half> @vfmsac_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, define <2 x half> @vfmsac_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -38,8 +44,11 @@ define <2 x half> @vfmsac_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x h define <2 x half> @vfmsac_vf_v2f16(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -53,8 +62,11 @@ define <2 x half> @vfmsac_vf_v2f16(<2 x half> %a, half %b, <2 x half> %c, <2 x i define <2 x half> @vfmsac_vf_v2f16_commute(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -68,8 +80,11 @@ define <2 x half> @vfmsac_vf_v2f16_commute(<2 x half> %a, half %b, <2 x half> %c define <2 x half> @vfmsac_vf_v2f16_unmasked(<2 x half> %a, half %b, <2 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -83,9 +98,10 @@ define <2 x half> @vfmsac_vf_v2f16_unmasked(<2 x half> %a, half %b, <2 x half> % define <2 x half> @vfmsac_vv_v2f16_ta(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %negc = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %negc, <2 x i1> splat (i1 -1), i32 %evl) @@ -96,9 +112,10 @@ define <2 x half> @vfmsac_vv_v2f16_ta(<2 
x half> %a, <2 x half> %b, <2 x half> % define <2 x half> @vfmsac_vf_v2f16_ta(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -111,9 +128,10 @@ define <2 x half> @vfmsac_vf_v2f16_ta(<2 x half> %a, half %b, <2 x half> %c, <2 define <2 x half> @vfmsac_vf_v2f16_commute_ta(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -131,8 +149,11 @@ declare <4 x half> @llvm.vp.select.v4f16(<4 x i1>, <4 x half>, <4 x half>, i32) define <4 x half> @vfmsac_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -144,8 +165,11 @@ define <4 x half> @vfmsac_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, define <4 x half> @vfmsac_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -157,8 +181,11 @@ define <4 x half> @vfmsac_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x h define <4 x half> @vfmsac_vf_v4f16(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -172,8 +199,11 @@ define <4 x half> @vfmsac_vf_v4f16(<4 x half> %a, half 
%b, <4 x half> %c, <4 x i define <4 x half> @vfmsac_vf_v4f16_commute(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -187,8 +217,11 @@ define <4 x half> @vfmsac_vf_v4f16_commute(<4 x half> %a, half %b, <4 x half> %c define <4 x half> @vfmsac_vf_v4f16_unmasked(<4 x half> %a, half %b, <4 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -202,9 +235,10 @@ define <4 x half> @vfmsac_vf_v4f16_unmasked(<4 x half> %a, half %b, <4 x half> % define <4 x half> @vfmsac_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %negc = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %negc, <4 x i1> splat (i1 -1), i32 %evl) @@ -215,9 +249,10 @@ define <4 x half> @vfmsac_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> % define <4 x half> @vfmsac_vf_v4f16_ta(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -230,9 +265,10 @@ define <4 x half> @vfmsac_vf_v4f16_ta(<4 x half> %a, half %b, <4 x half> %c, <4 define <4 x half> @vfmsac_vf_v4f16_commute_ta(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -250,8 +286,11 @@ declare <8 x half> 
@llvm.vp.select.v8f16(<8 x i1>, <8 x half>, <8 x half>, i32) define <8 x half> @vfmsac_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -263,8 +302,11 @@ define <8 x half> @vfmsac_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, define <8 x half> @vfmsac_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -276,8 +318,11 @@ define <8 x half> @vfmsac_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x h define <8 x half> @vfmsac_vf_v8f16(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -291,8 +336,11 @@ define <8 x half> @vfmsac_vf_v8f16(<8 x half> %a, half %b, <8 x half> %c, <8 x i define <8 x half> @vfmsac_vf_v8f16_commute(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -306,8 +354,11 @@ define <8 x half> @vfmsac_vf_v8f16_commute(<8 x half> %a, half %b, <8 x half> %c define <8 x half> @vfmsac_vf_v8f16_unmasked(<8 x half> %a, half %b, <8 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -321,9 +372,10 @@ define <8 x half> @vfmsac_vf_v8f16_unmasked(<8 x half> %a, half %b, <8 x half> % define <8 x half> @vfmsac_vv_v8f16_ta(<8 x half> %a, <8 x 
half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %negc = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %negc, <8 x i1> splat (i1 -1), i32 %evl) @@ -334,9 +386,10 @@ define <8 x half> @vfmsac_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> % define <8 x half> @vfmsac_vf_v8f16_ta(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -349,9 +402,10 @@ define <8 x half> @vfmsac_vf_v8f16_ta(<8 x half> %a, half %b, <8 x half> %c, <8 define <8 x half> @vfmsac_vf_v8f16_commute_ta(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -369,8 +423,11 @@ declare <16 x half> @llvm.vp.select.v16f16(<16 x i1>, <16 x half>, <16 x half>, define <16 x half> @vfmsac_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %negc = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -382,8 +439,11 @@ define <16 x half> @vfmsac_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> define <16 x half> @vfmsac_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v14 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %negc = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -395,8 +455,11 @@ define <16 x half> 
@vfmsac_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, <1 define <16 x half> @vfmsac_vf_v16f16(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -410,8 +473,11 @@ define <16 x half> @vfmsac_vf_v16f16(<16 x half> %a, half %b, <16 x half> %c, <1 define <16 x half> @vfmsac_vf_v16f16_commute(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -425,8 +491,11 @@ define <16 x half> @vfmsac_vf_v16f16_commute(<16 x half> %a, half %b, <16 x half define <16 x half> @vfmsac_vf_v16f16_unmasked(<16 x half> %a, half %b, <16 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfmsac.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v12 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -440,9 +509,10 @@ define <16 x half> @vfmsac_vf_v16f16_unmasked(<16 x half> %a, half %b, <16 x hal define <16 x half> @vfmsac_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v14, v0 ; CHECK-NEXT: ret %negc = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %negc, <16 x i1> splat (i1 -1), i32 %evl) @@ -453,9 +523,10 @@ define <16 x half> @vfmsac_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x ha define <16 x half> @vfmsac_vf_v16f16_ta(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -468,9 
+539,10 @@ define <16 x half> @vfmsac_vf_v16f16_ta(<16 x half> %a, half %b, <16 x half> %c, define <16 x half> @vfmsac_vf_v16f16_commute_ta(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -488,8 +560,11 @@ declare <32 x half> @llvm.vp.select.v32f16(<32 x i1>, <32 x half>, <32 x half>, define <32 x half> @vfmsac_vv_v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v20, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %negc = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -501,8 +576,11 @@ define <32 x half> @vfmsac_vv_v32f16(<32 x half> %a, <32 x half> %b, <32 x half> define <32 x half> @vfmsac_vv_v32f16_unmasked(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v32f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v20 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %negc = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -514,8 +592,11 @@ define <32 x half> @vfmsac_vv_v32f16_unmasked(<32 x half> %a, <32 x half> %b, <3 define <32 x half> @vfmsac_vf_v32f16(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -529,8 +610,11 @@ define <32 x half> @vfmsac_vf_v32f16(<32 x half> %a, half %b, <32 x half> %c, <3 define <32 x half> @vfmsac_vf_v32f16_commute(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v32f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 
0 @@ -544,8 +628,11 @@ define <32 x half> @vfmsac_vf_v32f16_commute(<32 x half> %a, half %b, <32 x half define <32 x half> @vfmsac_vf_v32f16_unmasked(<32 x half> %a, half %b, <32 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v32f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfmsac.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v16 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -559,9 +646,10 @@ define <32 x half> @vfmsac_vf_v32f16_unmasked(<32 x half> %a, half %b, <32 x hal define <32 x half> @vfmsac_vv_v32f16_ta(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v32f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v20, v0 ; CHECK-NEXT: ret %negc = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) %v = call <32 x half> @llvm.vp.fma.v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %negc, <32 x i1> splat (i1 -1), i32 %evl) @@ -572,9 +660,10 @@ define <32 x half> @vfmsac_vv_v32f16_ta(<32 x half> %a, <32 x half> %b, <32 x ha define <32 x half> @vfmsac_vf_v32f16_ta(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v32f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -587,9 +676,10 @@ define <32 x half> @vfmsac_vf_v32f16_ta(<32 x half> %a, half %b, <32 x half> %c, define <32 x half> @vfmsac_vf_v32f16_commute_ta(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v32f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -607,8 +697,11 @@ declare <2 x float> @llvm.vp.select.v2f32(<2 x i1>, <2 x float>, <2 x float>, i3 define <2 x float> @vfmsac_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; 
CHECK-NEXT: ret %negc = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -620,8 +713,11 @@ define <2 x float> @vfmsac_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> define <2 x float> @vfmsac_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -633,8 +729,11 @@ define <2 x float> @vfmsac_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 define <2 x float> @vfmsac_vf_v2f32(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -648,8 +747,11 @@ define <2 x float> @vfmsac_vf_v2f32(<2 x float> %a, float %b, <2 x float> %c, <2 define <2 x float> @vfmsac_vf_v2f32_commute(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -663,8 +765,11 @@ define <2 x float> @vfmsac_vf_v2f32_commute(<2 x float> %a, float %b, <2 x float define <2 x float> @vfmsac_vf_v2f32_unmasked(<2 x float> %a, float %b, <2 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -678,9 +783,10 @@ define <2 x float> @vfmsac_vf_v2f32_unmasked(<2 x float> %a, float %b, <2 x floa define <2 x float> @vfmsac_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %negc = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) %v = call <2 
x float> @llvm.vp.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %negc, <2 x i1> splat (i1 -1), i32 %evl) @@ -691,9 +797,10 @@ define <2 x float> @vfmsac_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x floa define <2 x float> @vfmsac_vf_v2f32_ta(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -706,9 +813,10 @@ define <2 x float> @vfmsac_vf_v2f32_ta(<2 x float> %a, float %b, <2 x float> %c, define <2 x float> @vfmsac_vf_v2f32_commute_ta(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -726,8 +834,11 @@ declare <4 x float> @llvm.vp.select.v4f32(<4 x i1>, <4 x float>, <4 x float>, i3 define <4 x float> @vfmsac_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -739,8 +850,11 @@ define <4 x float> @vfmsac_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> define <4 x float> @vfmsac_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -752,8 +866,11 @@ define <4 x float> @vfmsac_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 define <4 x float> @vfmsac_vf_v4f32(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm 
v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -767,8 +884,11 @@ define <4 x float> @vfmsac_vf_v4f32(<4 x float> %a, float %b, <4 x float> %c, <4 define <4 x float> @vfmsac_vf_v4f32_commute(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -782,8 +902,11 @@ define <4 x float> @vfmsac_vf_v4f32_commute(<4 x float> %a, float %b, <4 x float define <4 x float> @vfmsac_vf_v4f32_unmasked(<4 x float> %a, float %b, <4 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -797,9 +920,10 @@ define <4 x float> @vfmsac_vf_v4f32_unmasked(<4 x float> %a, float %b, <4 x floa define <4 x float> @vfmsac_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %negc = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) %v = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %negc, <4 x i1> splat (i1 -1), i32 %evl) @@ -810,9 +934,10 @@ define <4 x float> @vfmsac_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x floa define <4 x float> @vfmsac_vf_v4f32_ta(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -825,9 +950,10 @@ define <4 x float> @vfmsac_vf_v4f32_ta(<4 x float> %a, float %b, <4 x float> %c, define <4 x float> @vfmsac_vf_v4f32_commute_ta(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, 
v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -845,8 +971,11 @@ declare <8 x float> @llvm.vp.select.v8f32(<8 x i1>, <8 x float>, <8 x float>, i3 define <8 x float> @vfmsac_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %negc = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -858,8 +987,11 @@ define <8 x float> @vfmsac_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> define <8 x float> @vfmsac_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v14 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %negc = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -871,8 +1003,11 @@ define <8 x float> @vfmsac_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 define <8 x float> @vfmsac_vf_v8f32(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -886,8 +1021,11 @@ define <8 x float> @vfmsac_vf_v8f32(<8 x float> %a, float %b, <8 x float> %c, <8 define <8 x float> @vfmsac_vf_v8f32_commute(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -901,8 +1039,11 @@ define <8 x float> @vfmsac_vf_v8f32_commute(<8 x float> %a, float %b, <8 x float define <8 x float> @vfmsac_vf_v8f32_unmasked(<8 x float> %a, float %b, <8 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfmsac.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v12 ; 
CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -916,9 +1057,10 @@ define <8 x float> @vfmsac_vf_v8f32_unmasked(<8 x float> %a, float %b, <8 x floa define <8 x float> @vfmsac_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v14, v0 ; CHECK-NEXT: ret %negc = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) %v = call <8 x float> @llvm.vp.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %negc, <8 x i1> splat (i1 -1), i32 %evl) @@ -929,9 +1071,10 @@ define <8 x float> @vfmsac_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x floa define <8 x float> @vfmsac_vf_v8f32_ta(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -944,9 +1087,10 @@ define <8 x float> @vfmsac_vf_v8f32_ta(<8 x float> %a, float %b, <8 x float> %c, define <8 x float> @vfmsac_vf_v8f32_commute_ta(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -964,8 +1108,11 @@ declare <16 x float> @llvm.vp.select.v16f32(<16 x i1>, <16 x float>, <16 x float define <16 x float> @vfmsac_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v20, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %negc = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -977,8 +1124,11 @@ define <16 x float> @vfmsac_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x fl define <16 x float> @vfmsac_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: 
vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v20 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %negc = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -990,8 +1140,11 @@ define <16 x float> @vfmsac_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b, define <16 x float> @vfmsac_vf_v16f32(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1005,8 +1158,11 @@ define <16 x float> @vfmsac_vf_v16f32(<16 x float> %a, float %b, <16 x float> %c define <16 x float> @vfmsac_vf_v16f32_commute(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1020,8 +1176,11 @@ define <16 x float> @vfmsac_vf_v16f32_commute(<16 x float> %a, float %b, <16 x f define <16 x float> @vfmsac_vf_v16f32_unmasked(<16 x float> %a, float %b, <16 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfmsac.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v16 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1035,9 +1194,10 @@ define <16 x float> @vfmsac_vf_v16f32_unmasked(<16 x float> %a, float %b, <16 x define <16 x float> @vfmsac_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v20, v0 ; CHECK-NEXT: ret %negc = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) %v = call <16 x float> @llvm.vp.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %negc, <16 x i1> splat (i1 -1), i32 %evl) @@ -1048,9 +1208,10 @@ define <16 x float> @vfmsac_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 x define <16 x float> @vfmsac_vf_v16f32_ta(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t -; 
CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1063,9 +1224,10 @@ define <16 x float> @vfmsac_vf_v16f32_ta(<16 x float> %a, float %b, <16 x float> define <16 x float> @vfmsac_vf_v16f32_commute_ta(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v16f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1083,8 +1245,11 @@ declare <2 x double> @llvm.vp.select.v2f64(<2 x i1>, <2 x double>, <2 x double>, define <2 x double> @vfmsac_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1096,8 +1261,11 @@ define <2 x double> @vfmsac_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x doub define <2 x double> @vfmsac_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %negc = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1109,8 +1277,11 @@ define <2 x double> @vfmsac_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, define <2 x double> @vfmsac_vf_v2f64(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1124,8 +1295,11 @@ define <2 x double> @vfmsac_vf_v2f64(<2 x double> %a, double %b, <2 x double> %c define <2 x double> @vfmsac_vf_v2f64_commute(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli 
zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1139,8 +1313,11 @@ define <2 x double> @vfmsac_vf_v2f64_commute(<2 x double> %a, double %b, <2 x do define <2 x double> @vfmsac_vf_v2f64_unmasked(<2 x double> %a, double %b, <2 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1154,9 +1331,10 @@ define <2 x double> @vfmsac_vf_v2f64_unmasked(<2 x double> %a, double %b, <2 x d define <2 x double> @vfmsac_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %negc = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) %v = call <2 x double> @llvm.vp.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %negc, <2 x i1> splat (i1 -1), i32 %evl) @@ -1167,9 +1345,10 @@ define <2 x double> @vfmsac_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x d define <2 x double> @vfmsac_vf_v2f64_ta(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1182,9 +1361,10 @@ define <2 x double> @vfmsac_vf_v2f64_ta(<2 x double> %a, double %b, <2 x double> define <2 x double> @vfmsac_vf_v2f64_commute_ta(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v2f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1202,8 +1382,11 @@ declare <4 x double> @llvm.vp.select.v4f64(<4 x i1>, <4 x double>, <4 x double>, define <4 x double> @vfmsac_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, 
i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %negc = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1215,8 +1398,11 @@ define <4 x double> @vfmsac_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x doub define <4 x double> @vfmsac_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v14 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %negc = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1228,8 +1414,11 @@ define <4 x double> @vfmsac_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, define <4 x double> @vfmsac_vf_v4f64(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1243,8 +1432,11 @@ define <4 x double> @vfmsac_vf_v4f64(<4 x double> %a, double %b, <4 x double> %c define <4 x double> @vfmsac_vf_v4f64_commute(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1258,8 +1450,11 @@ define <4 x double> @vfmsac_vf_v4f64_commute(<4 x double> %a, double %b, <4 x do define <4 x double> @vfmsac_vf_v4f64_unmasked(<4 x double> %a, double %b, <4 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfmsac.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v12 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1273,9 +1468,10 @@ define <4 x double> @vfmsac_vf_v4f64_unmasked(<4 x double> %a, double %b, <4 x d define <4 x double> @vfmsac_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; 
CHECK-LABEL: vfmsac_vv_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v14, v0 ; CHECK-NEXT: ret %negc = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) %v = call <4 x double> @llvm.vp.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %negc, <4 x i1> splat (i1 -1), i32 %evl) @@ -1286,9 +1482,10 @@ define <4 x double> @vfmsac_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x d define <4 x double> @vfmsac_vf_v4f64_ta(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1301,9 +1498,10 @@ define <4 x double> @vfmsac_vf_v4f64_ta(<4 x double> %a, double %b, <4 x double> define <4 x double> @vfmsac_vf_v4f64_commute_ta(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v4f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1321,8 +1519,11 @@ declare <8 x double> @llvm.vp.select.v8f64(<8 x i1>, <8 x double>, <8 x double>, define <8 x double> @vfmsac_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v20, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %negc = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1334,8 +1535,11 @@ define <8 x double> @vfmsac_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x doub define <8 x double> @vfmsac_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v20 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %negc = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1347,8 +1551,11 @@ define <8 x 
double> @vfmsac_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, define <8 x double> @vfmsac_vf_v8f64(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1362,8 +1569,11 @@ define <8 x double> @vfmsac_vf_v8f64(<8 x double> %a, double %b, <8 x double> %c define <8 x double> @vfmsac_vf_v8f64_commute(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1377,8 +1587,11 @@ define <8 x double> @vfmsac_vf_v8f64_commute(<8 x double> %a, double %b, <8 x do define <8 x double> @vfmsac_vf_v8f64_unmasked(<8 x double> %a, double %b, <8 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfmsac.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v16 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1392,9 +1605,10 @@ define <8 x double> @vfmsac_vf_v8f64_unmasked(<8 x double> %a, double %b, <8 x d define <8 x double> @vfmsac_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vv_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v20, v0 ; CHECK-NEXT: ret %negc = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) %v = call <8 x double> @llvm.vp.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %negc, <8 x i1> splat (i1 -1), i32 %evl) @@ -1405,9 +1619,10 @@ define <8 x double> @vfmsac_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x d define <8 x double> @vfmsac_vf_v8f64_ta(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 
x i32> zeroinitializer @@ -1420,9 +1635,10 @@ define <8 x double> @vfmsac_vf_v8f64_ta(<8 x double> %a, double %b, <8 x double> define <8 x double> @vfmsac_vf_v8f64_commute_ta(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfmsac_vf_v8f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll index 268494bf337e1..8a9d0a24b4584 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfmsub-constrained-sdnode.ll @@ -13,7 +13,8 @@ define <2 x half> @vfmsub_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x half> %v ; CHECK-LABEL: vfmsub_vv_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x half> %vb %vd = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %va, <2 x half> %vc, <2 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -24,7 +25,8 @@ define <2 x half> @vfmsub_vf_v2f16(<2 x half> %va, <2 x half> %vb, half %c) stri ; CHECK-LABEL: vfmsub_vf_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -39,7 +41,8 @@ define <4 x half> @vfmsub_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x half> %v ; CHECK-LABEL: vfmsub_vv_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %neg = fneg <4 x half> %vc %vd = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %vb, <4 x half> %va, <4 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -50,7 +53,8 @@ define <4 x half> @vfmsub_vf_v4f16(<4 x half> %va, <4 x half> %vb, half %c) stri ; CHECK-LABEL: vfmsub_vf_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -65,7 +69,8 @@ define <8 x half> @vfmsub_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x half> %v ; CHECK-LABEL: vfmsub_vv_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmsac.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <8 x half> %va %vd = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %vb, <8 x half> %vc, <8 x half> %neg, metadata 
!"round.dynamic", metadata !"fpexcept.strict") @@ -76,7 +81,8 @@ define <8 x half> @vfmsub_vf_v8f16(<8 x half> %va, <8 x half> %vb, half %c) stri ; CHECK-LABEL: vfmsub_vf_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -91,7 +97,8 @@ define <16 x half> @vfmsub_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x hal ; CHECK-LABEL: vfmsub_vv_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v12, v10 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %neg = fneg <16 x half> %vb %vd = call <16 x half> @llvm.experimental.constrained.fma.v16f16(<16 x half> %vc, <16 x half> %va, <16 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -102,7 +109,8 @@ define <16 x half> @vfmsub_vf_v16f16(<16 x half> %va, <16 x half> %vb, half %c) ; CHECK-LABEL: vfmsub_vf_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfmsub.vf v8, fa0, v10 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -118,7 +126,8 @@ define <32 x half> @vfmsub_vv_v32f16(<32 x half> %va, <32 x half> %vb, <32 x hal ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfmsac.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <32 x half> %va %vd = call <32 x half> @llvm.experimental.constrained.fma.v32f16(<32 x half> %vc, <32 x half> %vb, <32 x half> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -130,7 +139,8 @@ define <32 x half> @vfmsub_vf_v32f16(<32 x half> %va, <32 x half> %vb, half %c) ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v12 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %c, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -145,7 +155,8 @@ define <2 x float> @vfmsub_vv_v2f32(<2 x float> %va, <2 x float> %vb, <2 x float ; CHECK-LABEL: vfmsub_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x float> %vb %vd = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %va, <2 x float> %vc, <2 x float> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -156,7 +167,8 @@ define <2 x float> @vfmsub_vf_v2f32(<2 x float> %va, <2 x float> %vb, float %c) ; CHECK-LABEL: vfmsub_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -171,7 +183,8 @@ define <4 x float> @vfmsub_vv_v4f32(<4 x float> %va, <4 x 
float> %vb, <4 x float ; CHECK-LABEL: vfmsub_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %neg = fneg <4 x float> %vc %vd = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %vb, <4 x float> %va, <4 x float> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -182,7 +195,8 @@ define <4 x float> @vfmsub_vf_v4f32(<4 x float> %va, <4 x float> %vb, float %c) ; CHECK-LABEL: vfmsub_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -197,7 +211,8 @@ define <8 x float> @vfmsub_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x float ; CHECK-LABEL: vfmsub_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfmsac.vv v8, v12, v10 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %neg = fneg <8 x float> %va %vd = call <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float> %vb, <8 x float> %vc, <8 x float> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -208,7 +223,8 @@ define <8 x float> @vfmsub_vf_v8f32(<8 x float> %va, <8 x float> %vb, float %c) ; CHECK-LABEL: vfmsub_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v10 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -223,7 +239,8 @@ define <16 x float> @vfmsub_vv_v16f32(<16 x float> %va, <16 x float> %vb, <16 x ; CHECK-LABEL: vfmsub_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <16 x float> %vb %vd = call <16 x float> @llvm.experimental.constrained.fma.v16f32(<16 x float> %vc, <16 x float> %va, <16 x float> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -234,7 +251,8 @@ define <16 x float> @vfmsub_vf_v16f32(<16 x float> %va, <16 x float> %vb, float ; CHECK-LABEL: vfmsub_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfmsub.vf v8, fa0, v12 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %c, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -249,7 +267,8 @@ define <2 x double> @vfmsub_vv_v2f64(<2 x double> %va, <2 x double> %vb, <2 x do ; CHECK-LABEL: vfmsub_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x double> %vb %vd = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %va, <2 x double> %vc, <2 x double> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -260,7 +279,8 @@ define <2 x double> @vfmsub_vf_v2f64(<2 x double> %va, <2 x double> %vb, double ; CHECK-LABEL: 
vfmsub_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %c, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -275,7 +295,8 @@ define <4 x double> @vfmsub_vv_v4f64(<4 x double> %va, <4 x double> %vb, <4 x do ; CHECK-LABEL: vfmsub_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmsub.vv v8, v10, v12 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 ; CHECK-NEXT: ret %neg = fneg <4 x double> %vc %vd = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %vb, <4 x double> %va, <4 x double> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -286,7 +307,8 @@ define <4 x double> @vfmsub_vf_v4f64(<4 x double> %va, <4 x double> %vb, double ; CHECK-LABEL: vfmsub_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmsub.vf v8, fa0, v10 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %c, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -301,7 +323,8 @@ define <8 x double> @vfmsub_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x do ; CHECK-LABEL: vfmsub_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfmsac.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <8 x double> %va %vd = call <8 x double> @llvm.experimental.constrained.fma.v8f64(<8 x double> %vb, <8 x double> %vc, <8 x double> %neg, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -312,7 +335,8 @@ define <8 x double> @vfmsub_vf_v8f64(<8 x double> %va, <8 x double> %vb, double ; CHECK-LABEL: vfmsub_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa0, v12 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %c, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll index 4ab94444b1b89..127b6cd706cf9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmacc-vp.ll @@ -12,8 +12,12 @@ declare <2 x half> @llvm.vp.select.v2f16(<2 x i1>, <2 x half>, <2 x half>, i32) define <2 x half> @vfnmacc_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -26,8 +30,12 @@ define <2 x half> @vfnmacc_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, define <2 x half> @vfnmacc_vv_v2f16_unmasked(<2 x half> 
%a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -40,8 +48,12 @@ define <2 x half> @vfnmacc_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x define <2 x half> @vfnmacc_vf_v2f16(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -56,8 +68,12 @@ define <2 x half> @vfnmacc_vf_v2f16(<2 x half> %a, half %b, <2 x half> %c, <2 x define <2 x half> @vfnmacc_vf_v2f16_commute(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -72,8 +88,12 @@ define <2 x half> @vfnmacc_vf_v2f16_commute(<2 x half> %a, half %b, <2 x half> % define <2 x half> @vfnmacc_vf_v2f16_unmasked(<2 x half> %a, half %b, <2 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -88,9 +108,11 @@ define <2 x half> @vfnmacc_vf_v2f16_unmasked(<2 x half> %a, half %b, <2 x half> define <2 x half> @vfnmacc_vv_v2f16_ta(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %nega = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %a, <2 x i1> splat (i1 -1), i32 %evl) %negc = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -102,9 +124,11 @@ define <2 x half> @vfnmacc_vv_v2f16_ta(<2 
x half> %a, <2 x half> %b, <2 x half> define <2 x half> @vfnmacc_vf_v2f16_ta(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -118,9 +142,11 @@ define <2 x half> @vfnmacc_vf_v2f16_ta(<2 x half> %a, half %b, <2 x half> %c, <2 define <2 x half> @vfnmacc_vf_v2f16_commute_ta(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -139,8 +165,12 @@ declare <4 x half> @llvm.vp.select.v4f16(<4 x i1>, <4 x half>, <4 x half>, i32) define <4 x half> @vfnmacc_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -153,8 +183,12 @@ define <4 x half> @vfnmacc_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, define <4 x half> @vfnmacc_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -167,8 +201,12 @@ define <4 x half> @vfnmacc_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x define <4 x half> @vfnmacc_vf_v4f16(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: 
vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -183,8 +221,12 @@ define <4 x half> @vfnmacc_vf_v4f16(<4 x half> %a, half %b, <4 x half> %c, <4 x define <4 x half> @vfnmacc_vf_v4f16_commute(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -199,8 +241,12 @@ define <4 x half> @vfnmacc_vf_v4f16_commute(<4 x half> %a, half %b, <4 x half> % define <4 x half> @vfnmacc_vf_v4f16_unmasked(<4 x half> %a, half %b, <4 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -215,9 +261,11 @@ define <4 x half> @vfnmacc_vf_v4f16_unmasked(<4 x half> %a, half %b, <4 x half> define <4 x half> @vfnmacc_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %nega = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %a, <4 x i1> splat (i1 -1), i32 %evl) %negc = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -229,9 +277,11 @@ define <4 x half> @vfnmacc_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> define <4 x half> @vfnmacc_vf_v4f16_ta(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -245,9 +295,11 @@ define <4 x half> @vfnmacc_vf_v4f16_ta(<4 x half> %a, half %b, <4 x half> %c, <4 define <4 x half> @vfnmacc_vf_v4f16_commute_ta(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v 
v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -266,8 +318,12 @@ declare <8 x half> @llvm.vp.select.v8f16(<8 x i1>, <8 x half>, <8 x half>, i32) define <8 x half> @vfnmacc_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -280,8 +336,12 @@ define <8 x half> @vfnmacc_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, define <8 x half> @vfnmacc_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -294,8 +354,12 @@ define <8 x half> @vfnmacc_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x define <8 x half> @vfnmacc_vf_v8f16(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -310,8 +374,12 @@ define <8 x half> @vfnmacc_vf_v8f16(<8 x half> %a, half %b, <8 x half> %c, <8 x define <8 x half> @vfnmacc_vf_v8f16_commute(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -326,8 +394,12 @@ define <8 x half> @vfnmacc_vf_v8f16_commute(<8 x half> %a, half %b, <8 x half> % define <8 x half> @vfnmacc_vf_v8f16_unmasked(<8 x half> %a, half %b, <8 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, 
ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -342,9 +414,11 @@ define <8 x half> @vfnmacc_vf_v8f16_unmasked(<8 x half> %a, half %b, <8 x half> define <8 x half> @vfnmacc_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %nega = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %a, <8 x i1> splat (i1 -1), i32 %evl) %negc = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -356,9 +430,11 @@ define <8 x half> @vfnmacc_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> define <8 x half> @vfnmacc_vf_v8f16_ta(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -372,9 +448,11 @@ define <8 x half> @vfnmacc_vf_v8f16_ta(<8 x half> %a, half %b, <8 x half> %c, <8 define <8 x half> @vfnmacc_vf_v8f16_commute_ta(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -393,8 +471,12 @@ declare <16 x half> @llvm.vp.select.v16f16(<16 x i1>, <16 x half>, <16 x half>, define <16 x half> @vfnmacc_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfnmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -407,8 +489,12 @@ define <16 x half> @vfnmacc_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half define <16 x half> @vfnmacc_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> 
%m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfnmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v14 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -421,8 +507,12 @@ define <16 x half> @vfnmacc_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, < define <16 x half> @vfnmacc_vf_v16f16(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -437,8 +527,12 @@ define <16 x half> @vfnmacc_vf_v16f16(<16 x half> %a, half %b, <16 x half> %c, < define <16 x half> @vfnmacc_vf_v16f16_commute(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -453,8 +547,12 @@ define <16 x half> @vfnmacc_vf_v16f16_commute(<16 x half> %a, half %b, <16 x hal define <16 x half> @vfnmacc_vf_v16f16_unmasked(<16 x half> %a, half %b, <16 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v12 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -469,9 +567,11 @@ define <16 x half> @vfnmacc_vf_v16f16_unmasked(<16 x half> %a, half %b, <16 x ha define <16 x half> @vfnmacc_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfnmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v14, v0 ; CHECK-NEXT: ret %nega = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %a, <16 x i1> splat (i1 -1), i32 %evl) %negc = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -483,9 +583,11 @@ define <16 x half> 
@vfnmacc_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x h define <16 x half> @vfnmacc_vf_v16f16_ta(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -499,9 +601,11 @@ define <16 x half> @vfnmacc_vf_v16f16_ta(<16 x half> %a, half %b, <16 x half> %c define <16 x half> @vfnmacc_vf_v16f16_commute_ta(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -520,8 +624,12 @@ declare <32 x half> @llvm.vp.select.v32f16(<32 x i1>, <32 x half>, <32 x half>, define <32 x half> @vfnmacc_vv_v32f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfnmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v20, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %a, <32 x i1> splat (i1 -1), i32 %evl) @@ -534,8 +642,12 @@ define <32 x half> @vfnmacc_vv_v32f16(<32 x half> %a, <32 x half> %b, <32 x half define <32 x half> @vfnmacc_vv_v32f16_unmasked(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v32f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfnmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v20 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %a, <32 x i1> splat (i1 -1), i32 %evl) @@ -548,8 +660,12 @@ define <32 x half> @vfnmacc_vv_v32f16_unmasked(<32 x half> %a, <32 x half> %b, < define <32 x half> @vfnmacc_vf_v32f16(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v32f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, 
e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -564,8 +680,12 @@ define <32 x half> @vfnmacc_vf_v32f16(<32 x half> %a, half %b, <32 x half> %c, < define <32 x half> @vfnmacc_vf_v32f16_commute(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v32f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -580,8 +700,12 @@ define <32 x half> @vfnmacc_vf_v32f16_commute(<32 x half> %a, half %b, <32 x hal define <32 x half> @vfnmacc_vf_v32f16_unmasked(<32 x half> %a, half %b, <32 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v32f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v16 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -596,9 +720,11 @@ define <32 x half> @vfnmacc_vf_v32f16_unmasked(<32 x half> %a, half %b, <32 x ha define <32 x half> @vfnmacc_vv_v32f16_ta(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v32f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfnmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v20, v0 ; CHECK-NEXT: ret %nega = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %a, <32 x i1> splat (i1 -1), i32 %evl) %negc = call <32 x half> @llvm.vp.fneg.v32f16(<32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -610,9 +736,11 @@ define <32 x half> @vfnmacc_vv_v32f16_ta(<32 x half> %a, <32 x half> %b, <32 x h define <32 x half> @vfnmacc_vf_v32f16_ta(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v32f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -626,9 +754,11 @@ define <32 x half> @vfnmacc_vf_v32f16_ta(<32 x half> %a, half %b, <32 x half> %c define <32 x half> @vfnmacc_vf_v32f16_commute_ta(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v32f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t -; 
CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -647,8 +777,12 @@ declare <2 x float> @llvm.vp.select.v2f32(<2 x i1>, <2 x float>, <2 x float>, i3 define <2 x float> @vfnmacc_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -661,8 +795,12 @@ define <2 x float> @vfnmacc_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> define <2 x float> @vfnmacc_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -675,8 +813,12 @@ define <2 x float> @vfnmacc_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 define <2 x float> @vfnmacc_vf_v2f32(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -691,8 +833,12 @@ define <2 x float> @vfnmacc_vf_v2f32(<2 x float> %a, float %b, <2 x float> %c, < define <2 x float> @vfnmacc_vf_v2f32_commute(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -707,8 +853,12 @@ define <2 x float> @vfnmacc_vf_v2f32_commute(<2 x float> %a, float %b, <2 x floa define <2 x float> @vfnmacc_vf_v2f32_unmasked(<2 x float> %a, float %b, <2 x float> %c, i32 zeroext %evl) { ; 
CHECK-LABEL: vfnmacc_vf_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -723,9 +873,11 @@ define <2 x float> @vfnmacc_vf_v2f32_unmasked(<2 x float> %a, float %b, <2 x flo define <2 x float> @vfnmacc_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %nega = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %a, <2 x i1> splat (i1 -1), i32 %evl) %negc = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -737,9 +889,11 @@ define <2 x float> @vfnmacc_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x flo define <2 x float> @vfnmacc_vf_v2f32_ta(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -753,9 +907,11 @@ define <2 x float> @vfnmacc_vf_v2f32_ta(<2 x float> %a, float %b, <2 x float> %c define <2 x float> @vfnmacc_vf_v2f32_commute_ta(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -774,8 +930,12 @@ declare <4 x float> @llvm.vp.select.v4f32(<4 x i1>, <4 x float>, <4 x float>, i3 define <4 x float> @vfnmacc_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a, <4 x i1> splat (i1 -1), 
i32 %evl) @@ -788,8 +948,12 @@ define <4 x float> @vfnmacc_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> define <4 x float> @vfnmacc_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -802,8 +966,12 @@ define <4 x float> @vfnmacc_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 define <4 x float> @vfnmacc_vf_v4f32(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -818,8 +986,12 @@ define <4 x float> @vfnmacc_vf_v4f32(<4 x float> %a, float %b, <4 x float> %c, < define <4 x float> @vfnmacc_vf_v4f32_commute(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -834,8 +1006,12 @@ define <4 x float> @vfnmacc_vf_v4f32_commute(<4 x float> %a, float %b, <4 x floa define <4 x float> @vfnmacc_vf_v4f32_unmasked(<4 x float> %a, float %b, <4 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -850,9 +1026,11 @@ define <4 x float> @vfnmacc_vf_v4f32_unmasked(<4 x float> %a, float %b, <4 x flo define <4 x float> @vfnmacc_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %nega = call <4 x float> @llvm.vp.fneg.v4f32(<4 x 
float> %a, <4 x i1> splat (i1 -1), i32 %evl) %negc = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -864,9 +1042,11 @@ define <4 x float> @vfnmacc_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x flo define <4 x float> @vfnmacc_vf_v4f32_ta(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -880,9 +1060,11 @@ define <4 x float> @vfnmacc_vf_v4f32_ta(<4 x float> %a, float %b, <4 x float> %c define <4 x float> @vfnmacc_vf_v4f32_commute_ta(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -901,8 +1083,12 @@ declare <8 x float> @llvm.vp.select.v8f32(<8 x i1>, <8 x float>, <8 x float>, i3 define <8 x float> @vfnmacc_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfnmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -915,8 +1101,12 @@ define <8 x float> @vfnmacc_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> define <8 x float> @vfnmacc_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfnmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v14 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -929,8 +1119,12 @@ define <8 x float> @vfnmacc_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 define <8 x float> @vfnmacc_vf_v8f32(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli 
zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -945,8 +1139,12 @@ define <8 x float> @vfnmacc_vf_v8f32(<8 x float> %a, float %b, <8 x float> %c, < define <8 x float> @vfnmacc_vf_v8f32_commute(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -961,8 +1159,12 @@ define <8 x float> @vfnmacc_vf_v8f32_commute(<8 x float> %a, float %b, <8 x floa define <8 x float> @vfnmacc_vf_v8f32_unmasked(<8 x float> %a, float %b, <8 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v12 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -977,9 +1179,11 @@ define <8 x float> @vfnmacc_vf_v8f32_unmasked(<8 x float> %a, float %b, <8 x flo define <8 x float> @vfnmacc_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfnmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v14, v0 ; CHECK-NEXT: ret %nega = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %a, <8 x i1> splat (i1 -1), i32 %evl) %negc = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -991,9 +1195,11 @@ define <8 x float> @vfnmacc_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x flo define <8 x float> @vfnmacc_vf_v8f32_ta(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1007,9 +1213,11 @@ define <8 x float> @vfnmacc_vf_v8f32_ta(<8 x float> %a, float %b, <8 x float> %c define <8 x float> @vfnmacc_vf_v8f32_commute_ta(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; 
CHECK-LABEL: vfnmacc_vf_v8f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1028,8 +1236,12 @@ declare <16 x float> @llvm.vp.select.v16f32(<16 x i1>, <16 x float>, <16 x float define <16 x float> @vfnmacc_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfnmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v20, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -1042,8 +1254,12 @@ define <16 x float> @vfnmacc_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x f define <16 x float> @vfnmacc_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfnmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v20 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -1056,8 +1272,12 @@ define <16 x float> @vfnmacc_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b define <16 x float> @vfnmacc_vf_v16f32(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1072,8 +1292,12 @@ define <16 x float> @vfnmacc_vf_v16f32(<16 x float> %a, float %b, <16 x float> % define <16 x float> @vfnmacc_vf_v16f32_commute(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1088,8 
+1312,12 @@ define <16 x float> @vfnmacc_vf_v16f32_commute(<16 x float> %a, float %b, <16 x define <16 x float> @vfnmacc_vf_v16f32_unmasked(<16 x float> %a, float %b, <16 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v16 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1104,9 +1332,11 @@ define <16 x float> @vfnmacc_vf_v16f32_unmasked(<16 x float> %a, float %b, <16 x define <16 x float> @vfnmacc_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfnmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v20, v0 ; CHECK-NEXT: ret %nega = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %a, <16 x i1> splat (i1 -1), i32 %evl) %negc = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -1118,9 +1348,11 @@ define <16 x float> @vfnmacc_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 define <16 x float> @vfnmacc_vf_v16f32_ta(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1134,9 +1366,11 @@ define <16 x float> @vfnmacc_vf_v16f32_ta(<16 x float> %a, float %b, <16 x float define <16 x float> @vfnmacc_vf_v16f32_commute_ta(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v16f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1155,8 +1389,12 @@ declare <2 x double> @llvm.vp.select.v2f64(<2 x i1>, <2 x double>, <2 x double>, define <2 x double> @vfnmacc_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; 
CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v11, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -1169,8 +1407,12 @@ define <2 x double> @vfnmacc_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x dou define <2 x double> @vfnmacc_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfnmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v11 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -1183,8 +1425,12 @@ define <2 x double> @vfnmacc_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, define <2 x double> @vfnmacc_vf_v2f64(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1199,8 +1445,12 @@ define <2 x double> @vfnmacc_vf_v2f64(<2 x double> %a, double %b, <2 x double> % define <2 x double> @vfnmacc_vf_v2f64_commute(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1215,8 +1465,12 @@ define <2 x double> @vfnmacc_vf_v2f64_commute(<2 x double> %a, double %b, <2 x d define <2 x double> @vfnmacc_vf_v2f64_unmasked(<2 x double> %a, double %b, <2 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v10 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1231,9 +1485,11 @@ define <2 x double> @vfnmacc_vf_v2f64_unmasked(<2 x double> %a, double %b, <2 x define <2 x double> @vfnmacc_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfnmacc.vv v10, v8, v9, 
v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v11, v10 +; CHECK-NEXT: vfmacc.vv v11, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v11, v0 ; CHECK-NEXT: ret %nega = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %a, <2 x i1> splat (i1 -1), i32 %evl) %negc = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1245,9 +1501,11 @@ define <2 x double> @vfnmacc_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x define <2 x double> @vfnmacc_vf_v2f64_ta(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1261,9 +1519,11 @@ define <2 x double> @vfnmacc_vf_v2f64_ta(<2 x double> %a, double %b, <2 x double define <2 x double> @vfnmacc_vf_v2f64_commute_ta(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v2f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfnmacc.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v10, v9 +; CHECK-NEXT: vfmacc.vf v10, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v10, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1282,8 +1542,12 @@ declare <4 x double> @llvm.vp.select.v4f64(<4 x i1>, <4 x double>, <4 x double>, define <4 x double> @vfnmacc_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfnmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v14, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -1296,8 +1560,12 @@ define <4 x double> @vfnmacc_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x dou define <4 x double> @vfnmacc_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfnmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v14 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -1310,8 +1578,12 @@ define <4 x double> 
@vfnmacc_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, define <4 x double> @vfnmacc_vf_v4f64(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1326,8 +1598,12 @@ define <4 x double> @vfnmacc_vf_v4f64(<4 x double> %a, double %b, <4 x double> % define <4 x double> @vfnmacc_vf_v4f64_commute(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v12, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1342,8 +1618,12 @@ define <4 x double> @vfnmacc_vf_v4f64_commute(<4 x double> %a, double %b, <4 x d define <4 x double> @vfnmacc_vf_v4f64_unmasked(<4 x double> %a, double %b, <4 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v12 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1358,9 +1638,11 @@ define <4 x double> @vfnmacc_vf_v4f64_unmasked(<4 x double> %a, double %b, <4 x define <4 x double> @vfnmacc_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfnmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v14, v12 +; CHECK-NEXT: vfmacc.vv v14, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v14, v0 ; CHECK-NEXT: ret %nega = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %a, <4 x i1> splat (i1 -1), i32 %evl) %negc = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1372,9 +1654,11 @@ define <4 x double> @vfnmacc_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x define <4 x double> @vfnmacc_vf_v4f64_ta(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head 
= insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1388,9 +1672,11 @@ define <4 x double> @vfnmacc_vf_v4f64_ta(<4 x double> %a, double %b, <4 x double define <4 x double> @vfnmacc_vf_v4f64_commute_ta(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v4f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfnmacc.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v12, v10 +; CHECK-NEXT: vfmacc.vf v12, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v12, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1409,8 +1695,12 @@ declare <8 x double> @llvm.vp.select.v8f64(<8 x i1>, <8 x double>, <8 x double>, define <8 x double> @vfnmacc_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfnmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v20, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -1423,8 +1713,12 @@ define <8 x double> @vfnmacc_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @vfnmacc_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfnmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v20 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -1437,8 +1731,12 @@ define <8 x double> @vfnmacc_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, define <8 x double> @vfnmacc_vf_v8f64(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1453,8 +1751,12 @@ define <8 x double> @vfnmacc_vf_v8f64(<8 x double> %a, double %b, <8 x double> % define <8 x double> @vfnmacc_vf_v8f64_commute(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t +; 
CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1469,8 +1771,12 @@ define <8 x double> @vfnmacc_vf_v8f64_commute(<8 x double> %a, double %b, <8 x d define <8 x double> @vfnmacc_vf_v8f64_unmasked(<8 x double> %a, double %b, <8 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v16 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1485,9 +1791,11 @@ define <8 x double> @vfnmacc_vf_v8f64_unmasked(<8 x double> %a, double %b, <8 x define <8 x double> @vfnmacc_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vv_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfnmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v20, v16 +; CHECK-NEXT: vfmacc.vv v20, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v20, v0 ; CHECK-NEXT: ret %nega = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %a, <8 x i1> splat (i1 -1), i32 %evl) %negc = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1499,9 +1807,11 @@ define <8 x double> @vfnmacc_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x define <8 x double> @vfnmacc_vf_v8f64_ta(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer @@ -1515,9 +1825,11 @@ define <8 x double> @vfnmacc_vf_v8f64_ta(<8 x double> %a, double %b, <8 x double define <8 x double> @vfnmacc_vf_v8f64_commute_ta(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmacc_vf_v8f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfnmacc.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfneg.v v16, v12 +; CHECK-NEXT: vfmacc.vf v16, fa0, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v16, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll index afc89717596b2..7698210d06fb5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmadd-constrained-sdnode.ll @@ -13,7 +13,9 @@ define <2 x half> @vfnmsub_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x half> % ; CHECK-LABEL: vfnmsub_vv_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v11, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmacc.vv v8, v10, v11 ; CHECK-NEXT: ret %neg = fneg <2 x half> %va %neg2 = fneg <2 x half> %vb @@ -25,7 +27,9 @@ define <2 x half> @vfnmsub_vf_v2f16(<2 x half> %va, <2 x half> %vb, half %c) str ; CHECK-LABEL: vfnmsub_vf_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmacc.vf v8, fa0, v10 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -41,7 +45,9 @@ define <4 x half> @vfnmsub_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x half> % ; CHECK-LABEL: vfnmsub_vv_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %neg = fneg <4 x half> %vb %neg2 = fneg <4 x half> %vc @@ -53,7 +59,10 @@ define <4 x half> @vfnmsub_vf_v4f16(<4 x half> %va, <4 x half> %vb, half %c) str ; CHECK-LABEL: vfnmsub_vf_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -69,7 +78,9 @@ define <8 x half> @vfnmsub_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x half> % ; CHECK-LABEL: vfnmsub_vv_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfnmacc.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <8 x half> %vb %neg2 = fneg <8 x half> %va @@ -81,7 +92,10 @@ define <8 x half> @vfnmsub_vf_v8f16(<8 x half> %va, <8 x half> %vb, half %c) str ; CHECK-LABEL: vfnmsub_vf_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -97,7 +111,9 @@ define <16 x half> @vfnmsub_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x ha ; CHECK-LABEL: vfnmsub_vv_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v12, v10 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %neg = fneg <16 x half> %vc %neg2 = fneg <16 x half> %vb @@ -109,7 +125,10 @@ define <16 x half> @vfnmsub_vf_v16f16(<16 x half> %va, <16 x 
half> %vb, half %c) ; CHECK-LABEL: vfnmsub_vf_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -126,7 +145,9 @@ define <32 x half> @vfnmsub_vv_v32f16(<32 x half> %va, <32 x half> %vb, <32 x ha ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <32 x half> %vc %neg2 = fneg <32 x half> %vb @@ -139,7 +160,10 @@ define <32 x half> @vfnmsub_vf_v32f16(<32 x half> %va, <32 x half> %vb, half %c) ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfnmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %c, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -155,7 +179,9 @@ define <2 x float> @vfnmsub_vv_v2f32(<2 x float> %va, <2 x float> %vb, <2 x floa ; CHECK-LABEL: vfnmsub_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x float> %vc %neg2 = fneg <2 x float> %vb @@ -167,7 +193,9 @@ define <2 x float> @vfnmsub_vf_v2f32(<2 x float> %va, <2 x float> %vb, float %c) ; CHECK-LABEL: vfnmsub_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmacc.vf v8, fa0, v10 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -183,7 +211,9 @@ define <4 x float> @vfnmsub_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x floa ; CHECK-LABEL: vfnmsub_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v11, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmacc.vv v8, v9, v11 ; CHECK-NEXT: ret %neg = fneg <4 x float> %va %neg2 = fneg <4 x float> %vc @@ -195,7 +225,10 @@ define <4 x float> @vfnmsub_vf_v4f32(<4 x float> %va, <4 x float> %vb, float %c) ; CHECK-LABEL: vfnmsub_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -211,7 +244,9 @@ define <8 x float> @vfnmsub_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x floa ; CHECK-LABEL: vfnmsub_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfnmacc.vv v8, v12, v10 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfneg.v v8, 
v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %neg = fneg <8 x float> %vc %neg2 = fneg <8 x float> %va @@ -223,7 +258,10 @@ define <8 x float> @vfnmsub_vf_v8f32(<8 x float> %va, <8 x float> %vb, float %c) ; CHECK-LABEL: vfnmsub_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfnmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -239,7 +277,9 @@ define <16 x float> @vfnmsub_vv_v16f32(<16 x float> %va, <16 x float> %vb, <16 x ; CHECK-LABEL: vfnmsub_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v20, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmacc.vv v8, v16, v20 ; CHECK-NEXT: ret %neg = fneg <16 x float> %va %neg2 = fneg <16 x float> %vb @@ -251,7 +291,10 @@ define <16 x float> @vfnmsub_vf_v16f32(<16 x float> %va, <16 x float> %vb, float ; CHECK-LABEL: vfnmsub_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %c, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -267,7 +310,9 @@ define <2 x double> @vfnmsub_vv_v2f64(<2 x double> %va, <2 x double> %vb, <2 x d ; CHECK-LABEL: vfnmsub_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v11, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmacc.vv v8, v10, v11 ; CHECK-NEXT: ret %neg = fneg <2 x double> %va %neg2 = fneg <2 x double> %vb @@ -279,7 +324,9 @@ define <2 x double> @vfnmsub_vf_v2f64(<2 x double> %va, <2 x double> %vb, double ; CHECK-LABEL: vfnmsub_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmacc.vf v8, fa0, v10 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %c, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -295,7 +342,9 @@ define <4 x double> @vfnmsub_vv_v4f64(<4 x double> %va, <4 x double> %vb, <4 x d ; CHECK-LABEL: vfnmsub_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfnmadd.vv v8, v10, v12 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 ; CHECK-NEXT: ret %neg = fneg <4 x double> %vb %neg2 = fneg <4 x double> %vc @@ -307,7 +356,10 @@ define <4 x double> @vfnmsub_vf_v4f64(<4 x double> %va, <4 x double> %vb, double ; CHECK-LABEL: vfnmsub_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfnmadd.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %c, i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -323,7 +375,9 @@ define <8 x double> 
@vfnmsub_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x d ; CHECK-LABEL: vfnmsub_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfnmacc.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <8 x double> %vb %neg2 = fneg <8 x double> %va @@ -335,7 +389,10 @@ define <8 x double> @vfnmsub_vf_v8f64(<8 x double> %va, <8 x double> %vb, double ; CHECK-LABEL: vfnmsub_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfnmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %c, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll index 4d9b002cc785c..3b11f99c0fb08 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsac-vp.ll @@ -12,8 +12,11 @@ declare <2 x half> @llvm.vp.select.v2f16(<2 x i1>, <2 x half>, <2 x half>, i32) define <2 x half> @vfnmsac_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -25,8 +28,11 @@ define <2 x half> @vfnmsac_vv_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, define <2 x half> @vfnmsac_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -38,8 +44,11 @@ define <2 x half> @vfnmsac_vv_v2f16_unmasked(<2 x half> %a, <2 x half> %b, <2 x define <2 x half> @vfnmsac_vf_v2f16(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -53,8 +62,11 @@ define <2 x half> @vfnmsac_vf_v2f16(<2 x half> %a, half %b, <2 x half> %c, <2 x define <2 x half> @vfnmsac_vf_v2f16_commute(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vfnmsac_vf_v2f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -68,8 +80,11 @@ define <2 x half> @vfnmsac_vf_v2f16_commute(<2 x half> %a, half %b, <2 x half> % define <2 x half> @vfnmsac_vf_v2f16_unmasked(<2 x half> %a, half %b, <2 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 @@ -83,9 +98,10 @@ define <2 x half> @vfnmsac_vf_v2f16_unmasked(<2 x half> %a, half %b, <2 x half> define <2 x half> @vfnmsac_vv_v2f16_ta(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %nega = call <2 x half> @llvm.vp.fneg.v2f16(<2 x half> %a, <2 x i1> splat (i1 -1), i32 %evl) %v = call <2 x half> @llvm.vp.fma.v2f16(<2 x half> %nega, <2 x half> %b, <2 x half> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -96,9 +112,10 @@ define <2 x half> @vfnmsac_vv_v2f16_ta(<2 x half> %a, <2 x half> %b, <2 x half> define <2 x half> @vfnmsac_vf_v2f16_ta(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -111,9 +128,10 @@ define <2 x half> @vfnmsac_vf_v2f16_ta(<2 x half> %a, half %b, <2 x half> %c, <2 define <2 x half> @vfnmsac_vf_v2f16_commute_ta(<2 x half> %a, half %b, <2 x half> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x half> poison, half %b, i32 0 %vb = shufflevector <2 x half> %elt.head, <2 x half> poison, <2 x i32> zeroinitializer @@ -131,8 +149,11 @@ declare <4 x half> @llvm.vp.select.v4f16(<4 x i1>, <4 x half>, <4 x half>, i32) define <4 x half> @vfnmsac_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) 
{ ; CHECK-LABEL: vfnmsac_vv_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -144,8 +165,11 @@ define <4 x half> @vfnmsac_vv_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, define <4 x half> @vfnmsac_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -157,8 +181,11 @@ define <4 x half> @vfnmsac_vv_v4f16_unmasked(<4 x half> %a, <4 x half> %b, <4 x define <4 x half> @vfnmsac_vf_v4f16(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -172,8 +199,11 @@ define <4 x half> @vfnmsac_vf_v4f16(<4 x half> %a, half %b, <4 x half> %c, <4 x define <4 x half> @vfnmsac_vf_v4f16_commute(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -187,8 +217,11 @@ define <4 x half> @vfnmsac_vf_v4f16_commute(<4 x half> %a, half %b, <4 x half> % define <4 x half> @vfnmsac_vf_v4f16_unmasked(<4 x half> %a, half %b, <4 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 @@ -202,9 +235,10 @@ define <4 x half> @vfnmsac_vf_v4f16_unmasked(<4 x half> %a, half %b, <4 x half> define <4 x half> @vfnmsac_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu 
-; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %nega = call <4 x half> @llvm.vp.fneg.v4f16(<4 x half> %a, <4 x i1> splat (i1 -1), i32 %evl) %v = call <4 x half> @llvm.vp.fma.v4f16(<4 x half> %nega, <4 x half> %b, <4 x half> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -215,9 +249,10 @@ define <4 x half> @vfnmsac_vv_v4f16_ta(<4 x half> %a, <4 x half> %b, <4 x half> define <4 x half> @vfnmsac_vf_v4f16_ta(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -230,9 +265,10 @@ define <4 x half> @vfnmsac_vf_v4f16_ta(<4 x half> %a, half %b, <4 x half> %c, <4 define <4 x half> @vfnmsac_vf_v4f16_commute_ta(<4 x half> %a, half %b, <4 x half> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x half> poison, half %b, i32 0 %vb = shufflevector <4 x half> %elt.head, <4 x half> poison, <4 x i32> zeroinitializer @@ -250,8 +286,11 @@ declare <8 x half> @llvm.vp.select.v8f16(<8 x i1>, <8 x half>, <8 x half>, i32) define <8 x half> @vfnmsac_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -263,8 +302,11 @@ define <8 x half> @vfnmsac_vv_v8f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, define <8 x half> @vfnmsac_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -276,8 +318,11 @@ define <8 x half> @vfnmsac_vv_v8f16_unmasked(<8 x half> %a, <8 x half> %b, <8 x define <8 x half> @vfnmsac_vf_v8f16(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: 
vfnmsac_vf_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -291,8 +336,11 @@ define <8 x half> @vfnmsac_vf_v8f16(<8 x half> %a, half %b, <8 x half> %c, <8 x define <8 x half> @vfnmsac_vf_v8f16_commute(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -306,8 +354,11 @@ define <8 x half> @vfnmsac_vf_v8f16_commute(<8 x half> %a, half %b, <8 x half> % define <8 x half> @vfnmsac_vf_v8f16_unmasked(<8 x half> %a, half %b, <8 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 @@ -321,9 +372,10 @@ define <8 x half> @vfnmsac_vf_v8f16_unmasked(<8 x half> %a, half %b, <8 x half> define <8 x half> @vfnmsac_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %nega = call <8 x half> @llvm.vp.fneg.v8f16(<8 x half> %a, <8 x i1> splat (i1 -1), i32 %evl) %v = call <8 x half> @llvm.vp.fma.v8f16(<8 x half> %nega, <8 x half> %b, <8 x half> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -334,9 +386,10 @@ define <8 x half> @vfnmsac_vv_v8f16_ta(<8 x half> %a, <8 x half> %b, <8 x half> define <8 x half> @vfnmsac_vf_v8f16_ta(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -349,9 +402,10 @@ define <8 x half> @vfnmsac_vf_v8f16_ta(<8 x half> %a, half %b, <8 x half> %c, <8 define <8 x half> @vfnmsac_vf_v8f16_commute_ta(<8 x half> %a, half %b, <8 x half> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f16_commute_ta: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x half> poison, half %b, i32 0 %vb = shufflevector <8 x half> %elt.head, <8 x half> poison, <8 x i32> zeroinitializer @@ -369,8 +423,11 @@ declare <16 x half> @llvm.vp.select.v16f16(<16 x i1>, <16 x half>, <16 x half>, define <16 x half> @vfnmsac_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -382,8 +439,11 @@ define <16 x half> @vfnmsac_vv_v16f16(<16 x half> %a, <16 x half> %b, <16 x half define <16 x half> @vfnmsac_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -395,8 +455,11 @@ define <16 x half> @vfnmsac_vv_v16f16_unmasked(<16 x half> %a, <16 x half> %b, < define <16 x half> @vfnmsac_vf_v16f16(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -410,8 +473,11 @@ define <16 x half> @vfnmsac_vf_v16f16(<16 x half> %a, half %b, <16 x half> %c, < define <16 x half> @vfnmsac_vf_v16f16_commute(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -425,8 +491,11 @@ define <16 x half> @vfnmsac_vf_v16f16_commute(<16 x half> %a, half %b, <16 x hal define <16 x half> @vfnmsac_vf_v16f16_unmasked(<16 x half> %a, half %b, <16 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f16_unmasked: ; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 @@ -440,9 +509,10 @@ define <16 x half> @vfnmsac_vf_v16f16_unmasked(<16 x half> %a, half %b, <16 x ha define <16 x half> @vfnmsac_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %nega = call <16 x half> @llvm.vp.fneg.v16f16(<16 x half> %a, <16 x i1> splat (i1 -1), i32 %evl) %v = call <16 x half> @llvm.vp.fma.v16f16(<16 x half> %nega, <16 x half> %b, <16 x half> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -453,9 +523,10 @@ define <16 x half> @vfnmsac_vv_v16f16_ta(<16 x half> %a, <16 x half> %b, <16 x h define <16 x half> @vfnmsac_vf_v16f16_ta(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -468,9 +539,10 @@ define <16 x half> @vfnmsac_vf_v16f16_ta(<16 x half> %a, half %b, <16 x half> %c define <16 x half> @vfnmsac_vf_v16f16_commute_ta(<16 x half> %a, half %b, <16 x half> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x half> poison, half %b, i32 0 %vb = shufflevector <16 x half> %elt.head, <16 x half> poison, <16 x i32> zeroinitializer @@ -488,8 +560,11 @@ declare <32 x half> @llvm.vp.select.v26f16(<32 x i1>, <32 x half>, <32 x half>, define <32 x half> @vfnmsac_vv_v26f16(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v26f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <32 x half> @llvm.vp.fneg.v26f16(<32 x half> %a, <32 x i1> splat (i1 -1), i32 %evl) @@ -501,8 +576,11 @@ define <32 x half> @vfnmsac_vv_v26f16(<32 x half> %a, <32 x half> %b, <32 x half define <32 x half> @vfnmsac_vv_v26f16_unmasked(<32 
x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v26f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <32 x half> @llvm.vp.fneg.v26f16(<32 x half> %a, <32 x i1> splat (i1 -1), i32 %evl) @@ -514,8 +592,11 @@ define <32 x half> @vfnmsac_vv_v26f16_unmasked(<32 x half> %a, <32 x half> %b, < define <32 x half> @vfnmsac_vf_v26f16(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v26f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -529,8 +610,11 @@ define <32 x half> @vfnmsac_vf_v26f16(<32 x half> %a, half %b, <32 x half> %c, < define <32 x half> @vfnmsac_vf_v26f16_commute(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v26f16_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -544,8 +628,11 @@ define <32 x half> @vfnmsac_vf_v26f16_commute(<32 x half> %a, half %b, <32 x hal define <32 x half> @vfnmsac_vf_v26f16_unmasked(<32 x half> %a, half %b, <32 x half> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v26f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 @@ -559,9 +646,10 @@ define <32 x half> @vfnmsac_vf_v26f16_unmasked(<32 x half> %a, half %b, <32 x ha define <32 x half> @vfnmsac_vv_v26f16_ta(<32 x half> %a, <32 x half> %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v26f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %nega = call <32 x half> @llvm.vp.fneg.v26f16(<32 x half> %a, <32 x i1> splat (i1 -1), i32 %evl) %v = call <32 x half> @llvm.vp.fma.v26f16(<32 x half> %nega, <32 x half> %b, <32 x half> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -572,9 +660,10 @@ define <32 x half> @vfnmsac_vv_v26f16_ta(<32 x half> %a, <32 x half> %b, <32 x h define <32 x half> 
@vfnmsac_vf_v26f16_ta(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v26f16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -587,9 +676,10 @@ define <32 x half> @vfnmsac_vf_v26f16_ta(<32 x half> %a, half %b, <32 x half> %c define <32 x half> @vfnmsac_vf_v26f16_commute_ta(<32 x half> %a, half %b, <32 x half> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v26f16_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x half> poison, half %b, i32 0 %vb = shufflevector <32 x half> %elt.head, <32 x half> poison, <32 x i32> zeroinitializer @@ -607,8 +697,11 @@ declare <2 x float> @llvm.vp.select.v2f32(<2 x i1>, <2 x float>, <2 x float>, i3 define <2 x float> @vfnmsac_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -620,8 +713,11 @@ define <2 x float> @vfnmsac_vv_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> define <2 x float> @vfnmsac_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -633,8 +729,11 @@ define <2 x float> @vfnmsac_vv_v2f32_unmasked(<2 x float> %a, <2 x float> %b, <2 define <2 x float> @vfnmsac_vf_v2f32(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -648,8 +747,11 @@ define <2 x float> @vfnmsac_vf_v2f32(<2 x float> %a, float %b, <2 x 
float> %c, < define <2 x float> @vfnmsac_vf_v2f32_commute(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -663,8 +765,11 @@ define <2 x float> @vfnmsac_vf_v2f32_commute(<2 x float> %a, float %b, <2 x floa define <2 x float> @vfnmsac_vf_v2f32_unmasked(<2 x float> %a, float %b, <2 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 @@ -678,9 +783,10 @@ define <2 x float> @vfnmsac_vf_v2f32_unmasked(<2 x float> %a, float %b, <2 x flo define <2 x float> @vfnmsac_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %nega = call <2 x float> @llvm.vp.fneg.v2f32(<2 x float> %a, <2 x i1> splat (i1 -1), i32 %evl) %v = call <2 x float> @llvm.vp.fma.v2f32(<2 x float> %nega, <2 x float> %b, <2 x float> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -691,9 +797,10 @@ define <2 x float> @vfnmsac_vv_v2f32_ta(<2 x float> %a, <2 x float> %b, <2 x flo define <2 x float> @vfnmsac_vf_v2f32_ta(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -706,9 +813,10 @@ define <2 x float> @vfnmsac_vf_v2f32_ta(<2 x float> %a, float %b, <2 x float> %c define <2 x float> @vfnmsac_vf_v2f32_commute_ta(<2 x float> %a, float %b, <2 x float> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x float> poison, float %b, i32 0 %vb = shufflevector <2 x float> %elt.head, <2 x float> poison, <2 x i32> zeroinitializer @@ -726,8 +834,11 @@ 
declare <4 x float> @llvm.vp.select.v4f32(<4 x i1>, <4 x float>, <4 x float>, i3 define <4 x float> @vfnmsac_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -739,8 +850,11 @@ define <4 x float> @vfnmsac_vv_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> define <4 x float> @vfnmsac_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -752,8 +866,11 @@ define <4 x float> @vfnmsac_vv_v4f32_unmasked(<4 x float> %a, <4 x float> %b, <4 define <4 x float> @vfnmsac_vf_v4f32(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -767,8 +884,11 @@ define <4 x float> @vfnmsac_vf_v4f32(<4 x float> %a, float %b, <4 x float> %c, < define <4 x float> @vfnmsac_vf_v4f32_commute(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -782,8 +902,11 @@ define <4 x float> @vfnmsac_vf_v4f32_commute(<4 x float> %a, float %b, <4 x floa define <4 x float> @vfnmsac_vf_v4f32_unmasked(<4 x float> %a, float %b, <4 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 @@ -797,9 +920,10 @@ define <4 x float> @vfnmsac_vf_v4f32_unmasked(<4 x float> %a, float %b, <4 x flo define <4 
x float> @vfnmsac_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %nega = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a, <4 x i1> splat (i1 -1), i32 %evl) %v = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %nega, <4 x float> %b, <4 x float> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -810,9 +934,10 @@ define <4 x float> @vfnmsac_vv_v4f32_ta(<4 x float> %a, <4 x float> %b, <4 x flo define <4 x float> @vfnmsac_vf_v4f32_ta(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -825,9 +950,10 @@ define <4 x float> @vfnmsac_vf_v4f32_ta(<4 x float> %a, float %b, <4 x float> %c define <4 x float> @vfnmsac_vf_v4f32_commute_ta(<4 x float> %a, float %b, <4 x float> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x float> poison, float %b, i32 0 %vb = shufflevector <4 x float> %elt.head, <4 x float> poison, <4 x i32> zeroinitializer @@ -845,8 +971,11 @@ declare <8 x float> @llvm.vp.select.v8f32(<8 x i1>, <8 x float>, <8 x float>, i3 define <8 x float> @vfnmsac_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -858,8 +987,11 @@ define <8 x float> @vfnmsac_vv_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> define <8 x float> @vfnmsac_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %a, <8 x i1> 
splat (i1 -1), i32 %evl) @@ -871,8 +1003,11 @@ define <8 x float> @vfnmsac_vv_v8f32_unmasked(<8 x float> %a, <8 x float> %b, <8 define <8 x float> @vfnmsac_vf_v8f32(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -886,8 +1021,11 @@ define <8 x float> @vfnmsac_vf_v8f32(<8 x float> %a, float %b, <8 x float> %c, < define <8 x float> @vfnmsac_vf_v8f32_commute(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -901,8 +1039,11 @@ define <8 x float> @vfnmsac_vf_v8f32_commute(<8 x float> %a, float %b, <8 x floa define <8 x float> @vfnmsac_vf_v8f32_unmasked(<8 x float> %a, float %b, <8 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 @@ -916,9 +1057,10 @@ define <8 x float> @vfnmsac_vf_v8f32_unmasked(<8 x float> %a, float %b, <8 x flo define <8 x float> @vfnmsac_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %nega = call <8 x float> @llvm.vp.fneg.v8f32(<8 x float> %a, <8 x i1> splat (i1 -1), i32 %evl) %v = call <8 x float> @llvm.vp.fma.v8f32(<8 x float> %nega, <8 x float> %b, <8 x float> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -929,9 +1071,10 @@ define <8 x float> @vfnmsac_vv_v8f32_ta(<8 x float> %a, <8 x float> %b, <8 x flo define <8 x float> @vfnmsac_vf_v8f32_ta(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 
x float> poison, <8 x i32> zeroinitializer @@ -944,9 +1087,10 @@ define <8 x float> @vfnmsac_vf_v8f32_ta(<8 x float> %a, float %b, <8 x float> %c define <8 x float> @vfnmsac_vf_v8f32_commute_ta(<8 x float> %a, float %b, <8 x float> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x float> poison, float %b, i32 0 %vb = shufflevector <8 x float> %elt.head, <8 x float> poison, <8 x i32> zeroinitializer @@ -964,8 +1108,11 @@ declare <16 x float> @llvm.vp.select.v16f32(<16 x i1>, <16 x float>, <16 x float define <16 x float> @vfnmsac_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -977,8 +1124,11 @@ define <16 x float> @vfnmsac_vv_v16f32(<16 x float> %a, <16 x float> %b, <16 x f define <16 x float> @vfnmsac_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %a, <16 x i1> splat (i1 -1), i32 %evl) @@ -990,8 +1140,11 @@ define <16 x float> @vfnmsac_vv_v16f32_unmasked(<16 x float> %a, <16 x float> %b define <16 x float> @vfnmsac_vf_v16f32(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1005,8 +1158,11 @@ define <16 x float> @vfnmsac_vf_v16f32(<16 x float> %a, float %b, <16 x float> % define <16 x float> @vfnmsac_vf_v16f32_commute(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f32_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, 
v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1020,8 +1176,11 @@ define <16 x float> @vfnmsac_vf_v16f32_commute(<16 x float> %a, float %b, <16 x define <16 x float> @vfnmsac_vf_v16f32_unmasked(<16 x float> %a, float %b, <16 x float> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 @@ -1035,9 +1194,10 @@ define <16 x float> @vfnmsac_vf_v16f32_unmasked(<16 x float> %a, float %b, <16 x define <16 x float> @vfnmsac_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %nega = call <16 x float> @llvm.vp.fneg.v16f32(<16 x float> %a, <16 x i1> splat (i1 -1), i32 %evl) %v = call <16 x float> @llvm.vp.fma.v16f32(<16 x float> %nega, <16 x float> %b, <16 x float> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -1048,9 +1208,10 @@ define <16 x float> @vfnmsac_vv_v16f32_ta(<16 x float> %a, <16 x float> %b, <16 define <16 x float> @vfnmsac_vf_v16f32_ta(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1063,9 +1224,10 @@ define <16 x float> @vfnmsac_vf_v16f32_ta(<16 x float> %a, float %b, <16 x float define <16 x float> @vfnmsac_vf_v16f32_commute_ta(<16 x float> %a, float %b, <16 x float> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v16f32_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x float> poison, float %b, i32 0 %vb = shufflevector <16 x float> %elt.head, <16 x float> poison, <16 x i32> zeroinitializer @@ -1083,8 +1245,11 @@ declare <2 x double> @llvm.vp.select.v2f64(<2 x i1>, <2 x double>, <2 x double>, define <2 x double> @vfnmsac_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; 
CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -1096,8 +1261,11 @@ define <2 x double> @vfnmsac_vv_v2f64(<2 x double> %a, <2 x double> %b, <2 x dou define <2 x double> @vfnmsac_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %nega = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %a, <2 x i1> splat (i1 -1), i32 %evl) @@ -1109,8 +1277,11 @@ define <2 x double> @vfnmsac_vv_v2f64_unmasked(<2 x double> %a, <2 x double> %b, define <2 x double> @vfnmsac_vf_v2f64(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1124,8 +1295,11 @@ define <2 x double> @vfnmsac_vf_v2f64(<2 x double> %a, double %b, <2 x double> % define <2 x double> @vfnmsac_vf_v2f64_commute(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1139,8 +1313,11 @@ define <2 x double> @vfnmsac_vf_v2f64_commute(<2 x double> %a, double %b, <2 x d define <2 x double> @vfnmsac_vf_v2f64_unmasked(<2 x double> %a, double %b, <2 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 @@ -1154,9 +1331,10 @@ define <2 x double> @vfnmsac_vf_v2f64_unmasked(<2 x double> %a, double %b, <2 x define <2 x double> @vfnmsac_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 +; CHECK-NEXT: 
vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %nega = call <2 x double> @llvm.vp.fneg.v2f64(<2 x double> %a, <2 x i1> splat (i1 -1), i32 %evl) %v = call <2 x double> @llvm.vp.fma.v2f64(<2 x double> %nega, <2 x double> %b, <2 x double> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1167,9 +1345,10 @@ define <2 x double> @vfnmsac_vv_v2f64_ta(<2 x double> %a, <2 x double> %b, <2 x define <2 x double> @vfnmsac_vf_v2f64_ta(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1182,9 +1361,10 @@ define <2 x double> @vfnmsac_vf_v2f64_ta(<2 x double> %a, double %b, <2 x double define <2 x double> @vfnmsac_vf_v2f64_commute_ta(<2 x double> %a, double %b, <2 x double> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v2f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vfnmsac.vf v9, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x double> poison, double %b, i32 0 %vb = shufflevector <2 x double> %elt.head, <2 x double> poison, <2 x i32> zeroinitializer @@ -1202,8 +1382,11 @@ declare <4 x double> @llvm.vp.select.v4f64(<4 x i1>, <4 x double>, <4 x double>, define <4 x double> @vfnmsac_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -1215,8 +1398,11 @@ define <4 x double> @vfnmsac_vv_v4f64(<4 x double> %a, <4 x double> %b, <4 x dou define <4 x double> @vfnmsac_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %nega = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %a, <4 x i1> splat (i1 -1), i32 %evl) @@ -1228,8 +1414,11 @@ define <4 x double> @vfnmsac_vv_v4f64_unmasked(<4 x double> %a, <4 x double> %b, define <4 x double> @vfnmsac_vf_v4f64(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfnmsac.vf v10, 
fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1243,8 +1432,11 @@ define <4 x double> @vfnmsac_vf_v4f64(<4 x double> %a, double %b, <4 x double> % define <4 x double> @vfnmsac_vf_v4f64_commute(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1258,8 +1450,11 @@ define <4 x double> @vfnmsac_vf_v4f64_commute(<4 x double> %a, double %b, <4 x d define <4 x double> @vfnmsac_vf_v4f64_unmasked(<4 x double> %a, double %b, <4 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 @@ -1273,9 +1468,10 @@ define <4 x double> @vfnmsac_vf_v4f64_unmasked(<4 x double> %a, double %b, <4 x define <4 x double> @vfnmsac_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %nega = call <4 x double> @llvm.vp.fneg.v4f64(<4 x double> %a, <4 x i1> splat (i1 -1), i32 %evl) %v = call <4 x double> @llvm.vp.fma.v4f64(<4 x double> %nega, <4 x double> %b, <4 x double> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1286,9 +1482,10 @@ define <4 x double> @vfnmsac_vv_v4f64_ta(<4 x double> %a, <4 x double> %b, <4 x define <4 x double> @vfnmsac_vf_v4f64_ta(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1301,9 +1498,10 @@ define <4 x double> @vfnmsac_vf_v4f64_ta(<4 x double> %a, double %b, <4 x double define <4 x double> @vfnmsac_vf_v4f64_commute_ta(<4 x double> %a, double %b, <4 x double> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v4f64_commute_ta: ; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vfnmsac.vf v10, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x double> poison, double %b, i32 0 %vb = shufflevector <4 x double> %elt.head, <4 x double> poison, <4 x i32> zeroinitializer @@ -1321,8 +1519,11 @@ declare <8 x double> @llvm.vp.select.v8f64(<8 x i1>, <8 x double>, <8 x double>, define <8 x double> @vfnmsac_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -1334,8 +1535,11 @@ define <8 x double> @vfnmsac_vv_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou define <8 x double> @vfnmsac_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %nega = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %a, <8 x i1> splat (i1 -1), i32 %evl) @@ -1347,8 +1551,11 @@ define <8 x double> @vfnmsac_vv_v8f64_unmasked(<8 x double> %a, <8 x double> %b, define <8 x double> @vfnmsac_vf_v8f64(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1362,8 +1569,11 @@ define <8 x double> @vfnmsac_vf_v8f64(<8 x double> %a, double %b, <8 x double> % define <8 x double> @vfnmsac_vf_v8f64_commute(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f64_commute: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1377,8 +1587,11 @@ define <8 x double> @vfnmsac_vf_v8f64_commute(<8 x double> %a, double %b, <8 x d define <8 x double> @vfnmsac_vf_v8f64_unmasked(<8 x double> %a, double %b, <8 x double> %c, i32 zeroext %evl) { ; CHECK-LABEL: 
vfnmsac_vf_v8f64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 @@ -1392,9 +1605,10 @@ define <8 x double> @vfnmsac_vf_v8f64_unmasked(<8 x double> %a, double %b, <8 x define <8 x double> @vfnmsac_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vv_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %nega = call <8 x double> @llvm.vp.fneg.v8f64(<8 x double> %a, <8 x i1> splat (i1 -1), i32 %evl) %v = call <8 x double> @llvm.vp.fma.v8f64(<8 x double> %nega, <8 x double> %b, <8 x double> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1405,9 +1619,10 @@ define <8 x double> @vfnmsac_vv_v8f64_ta(<8 x double> %a, <8 x double> %b, <8 x define <8 x double> @vfnmsac_vf_v8f64_ta(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer @@ -1420,9 +1635,10 @@ define <8 x double> @vfnmsac_vf_v8f64_ta(<8 x double> %a, double %b, <8 x double define <8 x double> @vfnmsac_vf_v8f64_commute_ta(<8 x double> %a, double %b, <8 x double> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vfnmsac_vf_v8f64_commute_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vfnmsac.vf v12, fa0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x double> poison, double %b, i32 0 %vb = shufflevector <8 x double> %elt.head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll index d9863bb36c739..4698022d2372c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfnmsub-constrained-sdnode.ll @@ -13,7 +13,8 @@ define <2 x half> @vfnmsub_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x half> % ; CHECK-LABEL: vfnmsub_vv_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x half> %va %vd = call <2 x half> @llvm.experimental.constrained.fma.v2f16(<2 x half> %neg, <2 x half> %vc, <2 x half> 
%vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -24,7 +25,8 @@ define <2 x half> @vfnmsub_vf_v2f16(<2 x half> %va, <2 x half> %vb, half %c) str ; CHECK-LABEL: vfnmsub_vf_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -39,7 +41,8 @@ define <4 x half> @vfnmsub_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x half> % ; CHECK-LABEL: vfnmsub_vv_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %neg = fneg <4 x half> %vb %vd = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %neg, <4 x half> %va, <4 x half> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -50,7 +53,9 @@ define <4 x half> @vfnmsub_vf_v4f16(<4 x half> %va, <4 x half> %vb, half %c) str ; CHECK-LABEL: vfnmsub_vf_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -65,7 +70,8 @@ define <8 x half> @vfnmsub_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x half> % ; CHECK-LABEL: vfnmsub_vv_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfnmsac.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <8 x half> %vb %vd = call <8 x half> @llvm.experimental.constrained.fma.v8f16(<8 x half> %neg, <8 x half> %vc, <8 x half> %va, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -76,7 +82,9 @@ define <8 x half> @vfnmsub_vf_v8f16(<8 x half> %va, <8 x half> %vb, half %c) str ; CHECK-LABEL: vfnmsub_vf_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -91,7 +99,8 @@ define <16 x half> @vfnmsub_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x ha ; CHECK-LABEL: vfnmsub_vv_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v12, v10 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %neg = fneg <16 x half> %vc %vd = call <16 x half> @llvm.experimental.constrained.fma.v16f16(<16 x half> %neg, <16 x half> %va, <16 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -102,7 +111,9 @@ define <16 x half> @vfnmsub_vf_v16f16(<16 x half> %va, <16 x half> %vb, half %c) ; CHECK-LABEL: vfnmsub_vf_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x 
half> poison, <16 x i32> zeroinitializer @@ -118,7 +129,8 @@ define <32 x half> @vfnmsub_vv_v32f16(<32 x half> %va, <32 x half> %vb, <32 x ha ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <32 x half> %vc %vd = call <32 x half> @llvm.experimental.constrained.fma.v32f16(<32 x half> %neg, <32 x half> %va, <32 x half> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -130,7 +142,9 @@ define <32 x half> @vfnmsub_vf_v32f16(<32 x half> %va, <32 x half> %vb, half %c) ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfnmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <32 x half> poison, half %c, i32 0 %splat = shufflevector <32 x half> %head, <32 x half> poison, <32 x i32> zeroinitializer @@ -145,7 +159,8 @@ define <2 x float> @vfnmsub_vv_v2f32(<2 x float> %va, <2 x float> %vb, <2 x floa ; CHECK-LABEL: vfnmsub_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x float> %vc %vd = call <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float> %va, <2 x float> %neg, <2 x float> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -156,7 +171,8 @@ define <2 x float> @vfnmsub_vf_v2f32(<2 x float> %va, <2 x float> %vb, float %c) ; CHECK-LABEL: vfnmsub_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -171,7 +187,8 @@ define <4 x float> @vfnmsub_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x floa ; CHECK-LABEL: vfnmsub_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v9, v10 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %neg = fneg <4 x float> %va %vd = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %vb, <4 x float> %neg, <4 x float> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -182,7 +199,9 @@ define <4 x float> @vfnmsub_vf_v4f32(<4 x float> %va, <4 x float> %vb, float %c) ; CHECK-LABEL: vfnmsub_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -197,7 +216,8 @@ define <8 x float> @vfnmsub_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x floa ; CHECK-LABEL: vfnmsub_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfnmsac.vv v8, v12, v10 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %neg = fneg <8 x float> %vc %vd = call <8 x float> @llvm.experimental.constrained.fma.v8f32(<8 x float> %vb, <8 x float> %neg, <8 x float> %va, 
metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -208,7 +228,9 @@ define <8 x float> @vfnmsub_vf_v8f32(<8 x float> %va, <8 x float> %vb, float %c) ; CHECK-LABEL: vfnmsub_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfnmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -223,7 +245,8 @@ define <16 x float> @vfnmsub_vv_v16f32(<16 x float> %va, <16 x float> %vb, <16 x ; CHECK-LABEL: vfnmsub_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <16 x float> %va %vd = call <16 x float> @llvm.experimental.constrained.fma.v16f32(<16 x float> %vc, <16 x float> %neg, <16 x float> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -234,7 +257,9 @@ define <16 x float> @vfnmsub_vf_v16f32(<16 x float> %va, <16 x float> %vb, float ; CHECK-LABEL: vfnmsub_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <16 x float> poison, float %c, i32 0 %splat = shufflevector <16 x float> %head, <16 x float> poison, <16 x i32> zeroinitializer @@ -249,7 +274,8 @@ define <2 x double> @vfnmsub_vv_v2f64(<2 x double> %va, <2 x double> %vb, <2 x d ; CHECK-LABEL: vfnmsub_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v10, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %neg = fneg <2 x double> %va %vd = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %neg, <2 x double> %vc, <2 x double> %vb, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -260,7 +286,8 @@ define <2 x double> @vfnmsub_vf_v2f64(<2 x double> %va, <2 x double> %vb, double ; CHECK-LABEL: vfnmsub_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v9 +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vf v8, fa0, v9 ; CHECK-NEXT: ret %head = insertelement <2 x double> poison, double %c, i32 0 %splat = shufflevector <2 x double> %head, <2 x double> poison, <2 x i32> zeroinitializer @@ -275,7 +302,8 @@ define <4 x double> @vfnmsub_vv_v4f64(<4 x double> %va, <4 x double> %vb, <4 x d ; CHECK-LABEL: vfnmsub_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfnmsub.vv v8, v10, v12 +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmadd.vv v8, v10, v12 ; CHECK-NEXT: ret %neg = fneg <4 x double> %vb %vd = call <4 x double> @llvm.experimental.constrained.fma.v4f64(<4 x double> %neg, <4 x double> %va, <4 x double> %vc, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -286,7 +314,9 @@ define <4 x double> @vfnmsub_vf_v4f64(<4 x double> %va, <4 x double> %vb, double ; CHECK-LABEL: vfnmsub_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfnmsub.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x double> poison, double %c, 
i32 0 %splat = shufflevector <4 x double> %head, <4 x double> poison, <4 x i32> zeroinitializer @@ -301,7 +331,8 @@ define <8 x double> @vfnmsub_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x d ; CHECK-LABEL: vfnmsub_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfnmsac.vv v8, v16, v12 +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %neg = fneg <8 x double> %vb %vd = call <8 x double> @llvm.experimental.constrained.fma.v8f64(<8 x double> %neg, <8 x double> %vc, <8 x double> %va, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -312,7 +343,9 @@ define <8 x double> @vfnmsub_vf_v8f64(<8 x double> %va, <8 x double> %vb, double ; CHECK-LABEL: vfnmsub_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfnmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <8 x double> poison, double %c, i32 0 %splat = shufflevector <8 x double> %head, <8 x double> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll index aba9056c78cda..9555f6bfba94f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -23,31 +23,20 @@ define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, ; NO_FOLDING-NEXT: vse32.v v8, (a2) ; NO_FOLDING-NEXT: ret ; -; ZVFH-LABEL: vfwmul_v2f116_multiple_users: -; ZVFH: # %bb.0: -; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFH-NEXT: vfwmul.vv v11, v8, v9 -; ZVFH-NEXT: vfwadd.vv v12, v8, v10 -; ZVFH-NEXT: vfwsub.vv v8, v9, v10 -; ZVFH-NEXT: vse32.v v11, (a0) -; ZVFH-NEXT: vse32.v v12, (a1) -; ZVFH-NEXT: vse32.v v8, (a2) -; ZVFH-NEXT: ret -; -; ZVFHMIN-LABEL: vfwmul_v2f116_multiple_users: -; ZVFHMIN: # %bb.0: -; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vfmul.vv v10, v11, v8 -; ZVFHMIN-NEXT: vfadd.vv v11, v11, v9 -; ZVFHMIN-NEXT: vfsub.vv v8, v8, v9 -; ZVFHMIN-NEXT: vse32.v v10, (a0) -; ZVFHMIN-NEXT: vse32.v v11, (a1) -; ZVFHMIN-NEXT: vse32.v v8, (a2) -; ZVFHMIN-NEXT: ret +; FOLDING-LABEL: vfwmul_v2f116_multiple_users: +; FOLDING: # %bb.0: +; FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; FOLDING-NEXT: vfwcvt.f.f.v v11, v8 +; FOLDING-NEXT: vfwcvt.f.f.v v8, v9 +; FOLDING-NEXT: vfwcvt.f.f.v v9, v10 +; FOLDING-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; FOLDING-NEXT: vfmul.vv v10, v11, v8 +; FOLDING-NEXT: vfadd.vv v11, v11, v9 +; FOLDING-NEXT: vfsub.vv v8, v8, v9 +; FOLDING-NEXT: vse32.v v10, (a0) +; FOLDING-NEXT: vse32.v v11, (a1) +; FOLDING-NEXT: vse32.v v8, (a2) +; FOLDING-NEXT: ret %c = fpext <2 x half> %a to <2 x float> %d = fpext <2 x half> %b to <2 x float> %d2 = fpext <2 x half> %b2 to <2 x float> @@ -79,11 +68,15 @@ define void @vfwmul_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a, ; FOLDING-LABEL: vfwmul_v2f32_multiple_users: ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; FOLDING-NEXT: vfwmul.vv v11, v8, v9 -; FOLDING-NEXT: vfwadd.vv v12, v8, v10 -; FOLDING-NEXT: vfwsub.vv v8, v9, v10 -; FOLDING-NEXT: vse64.v v11, (a0) -; FOLDING-NEXT: vse64.v 
v12, (a1) +; FOLDING-NEXT: vfwcvt.f.f.v v11, v8 +; FOLDING-NEXT: vfwcvt.f.f.v v8, v9 +; FOLDING-NEXT: vfwcvt.f.f.v v9, v10 +; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; FOLDING-NEXT: vfmul.vv v10, v11, v8 +; FOLDING-NEXT: vfadd.vv v11, v11, v9 +; FOLDING-NEXT: vfsub.vv v8, v8, v9 +; FOLDING-NEXT: vse64.v v10, (a0) +; FOLDING-NEXT: vse64.v v11, (a1) ; FOLDING-NEXT: vse64.v v8, (a2) ; FOLDING-NEXT: ret %c = fpext <2 x float> %a to <2 x double> @@ -117,12 +110,16 @@ define void @vfwmacc_v2f32_multiple_users(ptr %x, ptr %y, ptr %z, <2 x float> %a ; FOLDING-LABEL: vfwmacc_v2f32_multiple_users: ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; FOLDING-NEXT: vfwmul.vv v12, v8, v9 -; FOLDING-NEXT: vfwsub.vv v13, v9, v10 -; FOLDING-NEXT: vfwmacc.vv v11, v8, v10 -; FOLDING-NEXT: vse64.v v12, (a0) -; FOLDING-NEXT: vse64.v v11, (a1) -; FOLDING-NEXT: vse64.v v13, (a2) +; FOLDING-NEXT: vfwcvt.f.f.v v12, v8 +; FOLDING-NEXT: vfwcvt.f.f.v v8, v9 +; FOLDING-NEXT: vfwcvt.f.f.v v9, v10 +; FOLDING-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; FOLDING-NEXT: vfmul.vv v10, v12, v8 +; FOLDING-NEXT: vfmadd.vv v12, v9, v11 +; FOLDING-NEXT: vfsub.vv v8, v8, v9 +; FOLDING-NEXT: vse64.v v10, (a0) +; FOLDING-NEXT: vse64.v v12, (a1) +; FOLDING-NEXT: vse64.v v8, (a2) ; FOLDING-NEXT: ret %c = fpext <2 x float> %a to <2 x double> %d = fpext <2 x float> %b to <2 x double> @@ -206,3 +203,6 @@ define void @vfwmacc_v2f32_addend_user(ptr %x, <2 x float> %a, <2 x float> %b) { store <2 x double> %f, ptr %x ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ZVFH: {{.*}} +; ZVFHMIN: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll index ebb920f0ac42e..3795c99613f25 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll @@ -8,9 +8,12 @@ define <2 x float> @vfwadd_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwadd.vv v8, v9, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = load <2 x half>, ptr %y @@ -24,9 +27,12 @@ define <4 x float> @vfwadd_v4f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwadd.vv v8, v9, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b = load <4 x half>, ptr %y @@ -41,8 +47,11 @@ define <8 x float> @vfwadd_v8f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vfwadd.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x half>, ptr 
%x %b = load <8 x half>, ptr %y @@ -57,8 +66,11 @@ define <16 x float> @vfwadd_v16f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vfwadd.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x half>, ptr %x %b = load <16 x half>, ptr %y @@ -74,8 +86,11 @@ define <32 x float> @vfwadd_v32f16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vfwadd.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x half>, ptr %x %b = load <32 x half>, ptr %y @@ -97,18 +112,27 @@ define <64 x float> @vfwadd_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwadd.vv v8, v16, v24 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwadd.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v0 +; CHECK-NEXT: vfadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -128,9 +152,12 @@ define <2 x double> @vfwadd_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vfwadd.vv v8, v9, v10 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = load <2 x float>, ptr %y @@ -145,8 +172,11 @@ define <4 x double> @vfwadd_v4f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vfwadd.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = 
load <4 x float>, ptr %y @@ -161,8 +191,11 @@ define <8 x double> @vfwadd_v8f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vfwadd.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = load <8 x float>, ptr %y @@ -177,8 +210,11 @@ define <16 x double> @vfwadd_v16f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vfwadd.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = load <16 x float>, ptr %y @@ -200,16 +236,25 @@ define <32 x double> @vfwadd_v32f32(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwadd.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwadd.vv v16, v24, v0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v0 +; CHECK-NEXT: vfadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -229,8 +274,12 @@ define <2 x float> @vfwadd_vf_v2f16(ptr %x, half %y) { ; CHECK-LABEL: vfwadd_vf_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwadd.vf v8, v9, fa0 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = insertelement <2 x half> poison, half %y, i32 0 @@ -245,8 +294,12 @@ define <4 x float> @vfwadd_vf_v4f16(ptr %x, half %y) { ; CHECK-LABEL: vfwadd_vf_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwadd.vf v8, v9, fa0 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b = insertelement <4 x half> 
poison, half %y, i32 0 @@ -262,7 +315,11 @@ define <8 x float> @vfwadd_vf_v8f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwadd.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x half>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 @@ -278,7 +335,11 @@ define <16 x float> @vfwadd_vf_v16f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vfwadd.vf v8, v12, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x half>, ptr %x %b = insertelement <16 x half> poison, half %y, i32 0 @@ -295,7 +356,11 @@ define <32 x float> @vfwadd_vf_v32f16(ptr %x, half %y) { ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vfwadd.vf v8, v16, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x half>, ptr %x %b = insertelement <32 x half> poison, half %y, i32 0 @@ -310,8 +375,12 @@ define <2 x double> @vfwadd_vf_v2f32(ptr %x, float %y) { ; CHECK-LABEL: vfwadd_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vfwadd.vf v8, v9, fa0 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = insertelement <2 x float> poison, float %y, i32 0 @@ -327,7 +396,11 @@ define <4 x double> @vfwadd_vf_v4f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vfwadd.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = insertelement <4 x float> poison, float %y, i32 0 @@ -343,7 +416,11 @@ define <8 x double> @vfwadd_vf_v8f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vfwadd.vf v8, v12, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = insertelement <8 x float> poison, float %y, i32 0 @@ -359,7 +436,11 @@ define <16 x double> @vfwadd_vf_v16f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vfwadd.vf v8, v16, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v16 
; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = insertelement <16 x float> poison, float %y, i32 0 @@ -373,14 +454,35 @@ define <16 x double> @vfwadd_vf_v16f32(ptr %x, float %y) { define <32 x double> @vfwadd_vf_v32f32(ptr %x, float %y) { ; CHECK-LABEL: vfwadd_vf_v32f32: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwadd.vf v8, v16, fa0 -; CHECK-NEXT: vfwadd.vf v16, v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v0, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfadd.vv v16, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 @@ -395,9 +497,11 @@ define <2 x float> @vfwadd_wv_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = load <2 x half>, ptr %y @@ -410,9 +514,11 @@ define <4 x float> @vfwadd_wv_v4f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x half>, ptr %y @@ -425,9 +531,11 @@ define <8 x float> @vfwadd_wv_v8f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v12, (a1) ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = load <8 x half>, ptr %y @@ -440,9 +548,11 @@ define <16 x float> @vfwadd_wv_v16f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v16, 
(a1) ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v12, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = load <16 x half>, ptr %y @@ -456,9 +566,11 @@ define <32 x float> @vfwadd_wv_v32f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v16, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = load <32 x half>, ptr %y @@ -471,9 +583,11 @@ define <2 x double> @vfwadd_wv_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v9 +; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = load <2 x float>, ptr %y @@ -486,9 +600,11 @@ define <4 x double> @vfwadd_wv_v4f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v12, (a1) ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x double>, ptr %x %b = load <4 x float>, ptr %y @@ -501,9 +617,11 @@ define <8 x double> @vfwadd_wv_v8f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v12, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x double>, ptr %x %b = load <8 x float>, ptr %y @@ -516,9 +634,11 @@ define <16 x double> @vfwadd_wv_v16f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwadd_wv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: vfwadd.wv v8, v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x double>, ptr %x %b = load <16 x float>, ptr %y @@ -532,7 +652,10 @@ define <2 x float> @vfwadd_wf_v2f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = insertelement <2 x half> poison, half %y, i32 0 @@ -547,7 +670,10 @@ define <4 x float> @vfwadd_wf_v4f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: 
vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = insertelement <4 x half> poison, half %y, i32 0 @@ -562,7 +688,10 @@ define <8 x float> @vfwadd_wf_v8f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 @@ -577,7 +706,10 @@ define <16 x float> @vfwadd_wf_v16f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = insertelement <16 x half> poison, half %y, i32 0 @@ -592,7 +724,10 @@ define <2 x double> @vfwadd_wf_v2f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = insertelement <2 x float> poison, float %y, i32 0 @@ -607,7 +742,10 @@ define <4 x double> @vfwadd_wf_v4f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x double>, ptr %x %b = insertelement <4 x float> poison, float %y, i32 0 @@ -622,7 +760,10 @@ define <8 x double> @vfwadd_wf_v8f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x double>, ptr %x %b = insertelement <8 x float> poison, float %y, i32 0 @@ -637,7 +778,10 @@ define <16 x double> @vfwadd_wf_v16f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwadd.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x double>, ptr %x %b = insertelement <16 x float> poison, float %y, i32 0 @@ -651,8 +795,8 @@ define <2 x float> @vfwadd_vf2_v2f32(<2 x half> %x, half %y) { ; CHECK-LABEL: vfwadd_vf2_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwadd.vf v9, v8, fa0 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 +; CHECK-NEXT: vfwadd.wf v8, v9, fa0 ; CHECK-NEXT: ret %a = fpext <2 x half> %x to <2 x float> %b = fpext half %y to float diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll index a9e9b757f372e..03079f5067c98 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmacc.ll @@ -10,7 +10,10 @@ define <1 x float> @vfwmacc_vv_v1f32(<1 x float> %va, <1 x half> %vb, <1 x half> ; CHECK-LABEL: vfwmacc_vv_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x float> %ve = fpext <1 x half> %vc to <1 x float> @@ -22,7 +25,11 @@ define <1 x float> @vfwmacc_vf_v1f32(<1 x float> %va, <1 x half> %vb, half %c) { ; CHECK-LABEL: vfwmacc_vf_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -36,7 +43,12 @@ define <1 x float> @vfwnmacc_vv_v1f32(<1 x float> %va, <1 x half> %vb, <1 x half ; CHECK-LABEL: vfwnmacc_vv_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x float> %ve = fpext <1 x half> %vc to <1 x float> @@ -50,7 +62,15 @@ define <1 x float> @vfwnmacc_vf_v1f32(<1 x float> %va, <1 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_vf_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v12, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -66,7 +86,13 @@ define <1 x float> @vfwnmacc_fv_v1f32(<1 x float> %va, <1 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_fv_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmadd.vv v8, v11, v10 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -82,7 +108,11 @@ define <1 x float> @vfwmsac_vv_v1f32(<1 x float> %va, <1 x half> %vb, <1 x half> ; CHECK-LABEL: vfwmsac_vv_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; 
CHECK-NEXT: vfwmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x float> %ve = fpext <1 x half> %vc to <1 x float> @@ -95,7 +125,12 @@ define <1 x float> @vfwmsac_vf_v1f32(<1 x float> %va, <1 x half> %vb, half %c) { ; CHECK-LABEL: vfwmsac_vf_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -110,7 +145,11 @@ define <1 x float> @vfwnmsac_vv_v1f32(<1 x float> %va, <1 x half> %vb, <1 x half ; CHECK-LABEL: vfwnmsac_vv_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x float> %ve = fpext <1 x half> %vc to <1 x float> @@ -123,7 +162,12 @@ define <1 x float> @vfwnmsac_vf_v1f32(<1 x float> %va, <1 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_vf_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -138,7 +182,12 @@ define <1 x float> @vfwnmsac_fv_v1f32(<1 x float> %va, <1 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_fv_v1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -155,7 +204,10 @@ define <2 x float> @vfwmacc_vv_v2f32(<2 x float> %va, <2 x half> %vb, <2 x half> ; CHECK-LABEL: vfwmacc_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x float> %ve = fpext <2 x half> %vc to <2 x float> @@ -167,7 +219,11 @@ define <2 x float> @vfwmacc_vf_v2f32(<2 x float> %va, <2 x half> %vb, half %c) { ; CHECK-LABEL: vfwmacc_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; 
CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -181,7 +237,12 @@ define <2 x float> @vfwnmacc_vv_v2f32(<2 x float> %va, <2 x half> %vb, <2 x half ; CHECK-LABEL: vfwnmacc_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x float> %ve = fpext <2 x half> %vc to <2 x float> @@ -195,7 +256,15 @@ define <2 x float> @vfwnmacc_vf_v2f32(<2 x float> %va, <2 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v12, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -211,7 +280,13 @@ define <2 x float> @vfwnmacc_fv_v2f32(<2 x float> %va, <2 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_fv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmadd.vv v8, v11, v10 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -227,7 +302,11 @@ define <2 x float> @vfwmsac_vv_v2f32(<2 x float> %va, <2 x half> %vb, <2 x half> ; CHECK-LABEL: vfwmsac_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x float> %ve = fpext <2 x half> %vc to <2 x float> @@ -240,7 +319,12 @@ define <2 x float> @vfwmsac_vf_v2f32(<2 x float> %va, <2 x half> %vb, half %c) { ; CHECK-LABEL: vfwmsac_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -255,7 +339,11 @@ define <2 x float> @vfwnmsac_vv_v2f32(<2 x float> %va, <2 x half> %vb, <2 x half ; CHECK-LABEL: 
vfwnmsac_vv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x float> %ve = fpext <2 x half> %vc to <2 x float> @@ -268,7 +356,12 @@ define <2 x float> @vfwnmsac_vf_v2f32(<2 x float> %va, <2 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -283,7 +376,12 @@ define <2 x float> @vfwnmsac_fv_v2f32(<2 x float> %va, <2 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_fv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -301,7 +399,10 @@ define <4 x float> @vfwmacc_vv_v4f32(<4 x float> %va, <4 x half> %vb, <4 x half> ; CHECK-LABEL: vfwmacc_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x float> %ve = fpext <4 x half> %vc to <4 x float> @@ -313,7 +414,11 @@ define <4 x float> @vfwmacc_vf_v4f32(<4 x float> %va, <4 x half> %vb, half %c) { ; CHECK-LABEL: vfwmacc_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -327,7 +432,12 @@ define <4 x float> @vfwnmacc_vv_v4f32(<4 x float> %va, <4 x half> %vb, <4 x half ; CHECK-LABEL: vfwnmacc_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x float> %ve = fpext <4 x half> %vc to <4 x float> @@ -341,7 +451,15 @@ define <4 x float> @vfwnmacc_vf_v4f32(<4 x float> %va, <4 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: 
vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v12, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -357,7 +475,13 @@ define <4 x float> @vfwnmacc_fv_v4f32(<4 x float> %va, <4 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_fv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmadd.vv v8, v11, v10 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -373,7 +497,11 @@ define <4 x float> @vfwmsac_vv_v4f32(<4 x float> %va, <4 x half> %vb, <4 x half> ; CHECK-LABEL: vfwmsac_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x float> %ve = fpext <4 x half> %vc to <4 x float> @@ -386,7 +514,12 @@ define <4 x float> @vfwmsac_vf_v4f32(<4 x float> %va, <4 x half> %vb, half %c) { ; CHECK-LABEL: vfwmsac_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -401,7 +534,11 @@ define <4 x float> @vfwnmsac_vv_v4f32(<4 x float> %va, <4 x half> %vb, <4 x half ; CHECK-LABEL: vfwnmsac_vv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x float> %ve = fpext <4 x half> %vc to <4 x float> @@ -414,7 +551,12 @@ define <4 x float> @vfwnmsac_vf_v4f32(<4 x float> %va, <4 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_vf_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -429,7 +571,12 @@ define <4 x float> 
@vfwnmsac_fv_v4f32(<4 x float> %va, <4 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_fv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -446,7 +593,10 @@ define <8 x float> @vfwmacc_vv_v8f32(<8 x float> %va, <8 x half> %vb, <8 x half> ; CHECK-LABEL: vfwmacc_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v12, v14 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x float> %ve = fpext <8 x half> %vc to <8 x float> @@ -458,7 +608,11 @@ define <8 x float> @vfwmacc_vf_v8f32(<8 x float> %va, <8 x half> %vb, half %c) { ; CHECK-LABEL: vfwmacc_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -472,7 +626,12 @@ define <8 x float> @vfwnmacc_vv_v8f32(<8 x float> %va, <8 x half> %vb, <8 x half ; CHECK-LABEL: vfwnmacc_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v14, v10 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x float> %ve = fpext <8 x half> %vc to <8 x float> @@ -486,7 +645,15 @@ define <8 x float> @vfwnmacc_vf_v8f32(<8 x float> %va, <8 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v14, v10 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -502,7 +669,13 @@ define <8 x float> @vfwnmacc_fv_v8f32(<8 x float> %va, <8 x half> %vb, half %c) ; CHECK-LABEL: vfwnmacc_fv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: ret %head = insertelement 
<8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -518,7 +691,11 @@ define <8 x float> @vfwmsac_vv_v8f32(<8 x float> %va, <8 x half> %vb, <8 x half> ; CHECK-LABEL: vfwmsac_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v14 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x float> %ve = fpext <8 x half> %vc to <8 x float> @@ -531,7 +708,12 @@ define <8 x float> @vfwmsac_vf_v8f32(<8 x float> %va, <8 x half> %vb, half %c) { ; CHECK-LABEL: vfwmsac_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -546,7 +728,11 @@ define <8 x float> @vfwnmsac_vv_v8f32(<8 x float> %va, <8 x half> %vb, <8 x half ; CHECK-LABEL: vfwnmsac_vv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v12 +; CHECK-NEXT: vfmacc.vv v8, v14, v10 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x float> %ve = fpext <8 x half> %vc to <8 x float> @@ -559,7 +745,12 @@ define <8 x float> @vfwnmsac_vf_v8f32(<8 x float> %va, <8 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_vf_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -574,7 +765,12 @@ define <8 x float> @vfwnmsac_fv_v8f32(<8 x float> %va, <8 x half> %vb, half %c) ; CHECK-LABEL: vfwnmsac_fv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -591,7 +787,10 @@ define <16 x float> @vfwmacc_vv_v16f32(<16 x float> %va, <16 x half> %vb, <16 x ; CHECK-LABEL: vfwmacc_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v16, v20 ; CHECK-NEXT: ret %vd = fpext <16 x half> %vb to <16 x float> %ve = 
fpext <16 x half> %vc to <16 x float> @@ -603,7 +802,11 @@ define <16 x float> @vfwmacc_vf_v16f32(<16 x float> %va, <16 x half> %vb, half % ; CHECK-LABEL: vfwmacc_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -617,7 +820,12 @@ define <16 x float> @vfwnmacc_vv_v16f32(<16 x float> %va, <16 x half> %vb, <16 x ; CHECK-LABEL: vfwnmacc_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v8 +; CHECK-NEXT: vfneg.v v8, v16 +; CHECK-NEXT: vfmadd.vv v8, v20, v12 ; CHECK-NEXT: ret %vd = fpext <16 x half> %vb to <16 x float> %ve = fpext <16 x half> %vc to <16 x float> @@ -631,7 +839,15 @@ define <16 x float> @vfwnmacc_vf_v16f32(<16 x float> %va, <16 x half> %vb, half ; CHECK-LABEL: vfwnmacc_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v16 +; CHECK-NEXT: vfmadd.vv v8, v20, v12 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -647,7 +863,13 @@ define <16 x float> @vfwnmacc_fv_v16f32(<16 x float> %va, <16 x half> %vb, half ; CHECK-LABEL: vfwnmacc_fv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v16, v20 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -663,7 +885,11 @@ define <16 x float> @vfwmsac_vv_v16f32(<16 x float> %va, <16 x half> %vb, <16 x ; CHECK-LABEL: vfwmsac_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v20 ; CHECK-NEXT: ret %vd = fpext <16 x half> %vb to <16 x float> %ve = fpext <16 x half> %vc to <16 x float> @@ -676,7 +902,12 @@ define <16 x float> @vfwmsac_vf_v16f32(<16 x float> %va, <16 x half> %vb, half % ; CHECK-LABEL: vfwmsac_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: 
vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -691,7 +922,11 @@ define <16 x float> @vfwnmsac_vv_v16f32(<16 x float> %va, <16 x half> %vb, <16 x ; CHECK-LABEL: vfwnmsac_vv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v16 +; CHECK-NEXT: vfmacc.vv v8, v20, v12 ; CHECK-NEXT: ret %vd = fpext <16 x half> %vb to <16 x float> %ve = fpext <16 x half> %vc to <16 x float> @@ -704,7 +939,12 @@ define <16 x float> @vfwnmsac_vf_v16f32(<16 x float> %va, <16 x half> %vb, half ; CHECK-LABEL: vfwnmsac_vf_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -719,7 +959,12 @@ define <16 x float> @vfwnmsac_fv_v16f32(<16 x float> %va, <16 x half> %vb, half ; CHECK-LABEL: vfwnmsac_fv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <16 x half> poison, half %c, i32 0 %splat = shufflevector <16 x half> %head, <16 x half> poison, <16 x i32> zeroinitializer @@ -736,7 +981,10 @@ define <1 x double> @vfwmacc_vv_v1f64(<1 x double> %va, <1 x float> %vb, <1 x fl ; CHECK-LABEL: vfwmacc_vv_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <1 x float> %vb to <1 x double> %ve = fpext <1 x float> %vc to <1 x double> @@ -748,7 +996,11 @@ define <1 x double> @vfwmacc_vf_v1f64(<1 x double> %va, <1 x float> %vb, float % ; CHECK-LABEL: vfwmacc_vf_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %c, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -762,7 +1014,12 @@ define <1 x double> @vfwnmacc_vv_v1f64(<1 x double> %va, <1 x float> %vb, <1 x f ; CHECK-LABEL: vfwnmacc_vv_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; 
CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <1 x float> %vb to <1 x double> %ve = fpext <1 x float> %vc to <1 x double> @@ -776,7 +1033,15 @@ define <1 x double> @vfwnmacc_vf_v1f64(<1 x double> %va, <1 x float> %vb, float ; CHECK-LABEL: vfwnmacc_vf_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v12, v9 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %c, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -792,7 +1057,13 @@ define <1 x double> @vfwnmacc_fv_v1f64(<1 x double> %va, <1 x float> %vb, float ; CHECK-LABEL: vfwnmacc_fv_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmadd.vv v8, v11, v10 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %c, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -808,7 +1079,11 @@ define <1 x double> @vfwmsac_vv_v1f64(<1 x double> %va, <1 x float> %vb, <1 x fl ; CHECK-LABEL: vfwmsac_vv_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <1 x float> %vb to <1 x double> %ve = fpext <1 x float> %vc to <1 x double> @@ -821,7 +1096,12 @@ define <1 x double> @vfwmsac_vf_v1f64(<1 x double> %va, <1 x float> %vb, float % ; CHECK-LABEL: vfwmsac_vf_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %c, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -836,7 +1116,11 @@ define <1 x double> @vfwnmsac_vv_v1f64(<1 x double> %va, <1 x float> %vb, <1 x f ; CHECK-LABEL: vfwnmsac_vv_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <1 x float> %vb to <1 x double> %ve = fpext <1 x float> %vc to <1 x double> @@ -849,7 +1133,12 @@ define <1 x double> @vfwnmsac_vf_v1f64(<1 x double> %va, <1 x float> %vb, float ; CHECK-LABEL: vfwnmsac_vf_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: 
vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %c, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -864,7 +1153,12 @@ define <1 x double> @vfwnmsac_fv_v1f64(<1 x double> %va, <1 x float> %vb, float ; CHECK-LABEL: vfwnmsac_fv_v1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.s.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <1 x float> poison, float %c, i32 0 %splat = shufflevector <1 x float> %head, <1 x float> poison, <1 x i32> zeroinitializer @@ -881,7 +1175,10 @@ define <2 x double> @vfwmacc_vv_v2f64(<2 x double> %va, <2 x float> %vb, <2 x fl ; CHECK-LABEL: vfwmacc_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <2 x float> %vb to <2 x double> %ve = fpext <2 x float> %vc to <2 x double> @@ -893,7 +1190,11 @@ define <2 x double> @vfwmacc_vf_v2f64(<2 x double> %va, <2 x float> %vb, float % ; CHECK-LABEL: vfwmacc_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -907,7 +1208,12 @@ define <2 x double> @vfwnmacc_vv_v2f64(<2 x double> %va, <2 x float> %vb, <2 x f ; CHECK-LABEL: vfwnmacc_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <2 x float> %vb to <2 x double> %ve = fpext <2 x float> %vc to <2 x double> @@ -921,7 +1227,15 @@ define <2 x double> @vfwnmacc_vf_v2f64(<2 x double> %va, <2 x float> %vb, float ; CHECK-LABEL: vfwnmacc_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v11 +; CHECK-NEXT: vfmadd.vv v8, v12, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -937,7 +1251,13 @@ define <2 x double> @vfwnmacc_fv_v2f64(<2 x 
double> %va, <2 x float> %vb, float ; CHECK-LABEL: vfwnmacc_fv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v9 +; CHECK-NEXT: vfmadd.vv v8, v11, v10 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -953,7 +1273,11 @@ define <2 x double> @vfwmsac_vv_v2f64(<2 x double> %va, <2 x float> %vb, <2 x fl ; CHECK-LABEL: vfwmsac_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <2 x float> %vb to <2 x double> %ve = fpext <2 x float> %vc to <2 x double> @@ -966,7 +1290,12 @@ define <2 x double> @vfwmsac_vf_v2f64(<2 x double> %va, <2 x float> %vb, float % ; CHECK-LABEL: vfwmsac_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -981,7 +1310,11 @@ define <2 x double> @vfwnmsac_vv_v2f64(<2 x double> %va, <2 x float> %vb, <2 x f ; CHECK-LABEL: vfwnmsac_vv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v9, v10 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %vd = fpext <2 x float> %vb to <2 x double> %ve = fpext <2 x float> %vc to <2 x double> @@ -994,7 +1327,12 @@ define <2 x double> @vfwnmsac_vf_v2f64(<2 x double> %va, <2 x float> %vb, float ; CHECK-LABEL: vfwnmsac_vf_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v11 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> zeroinitializer @@ -1009,7 +1347,12 @@ define <2 x double> @vfwnmsac_fv_v2f64(<2 x double> %va, <2 x float> %vb, float ; CHECK-LABEL: vfwnmsac_fv_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfmv.v.f v10, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %head = insertelement <2 x float> poison, float %c, i32 0 %splat = shufflevector <2 x float> %head, <2 x float> poison, <2 x i32> 
zeroinitializer @@ -1027,7 +1370,10 @@ define <4 x double> @vfwmacc_vv_v4f64(<4 x double> %va, <4 x float> %vb, <4 x fl ; CHECK-LABEL: vfwmacc_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v12, v14 ; CHECK-NEXT: ret %vd = fpext <4 x float> %vb to <4 x double> %ve = fpext <4 x float> %vc to <4 x double> @@ -1039,7 +1385,11 @@ define <4 x double> @vfwmacc_vf_v4f64(<4 x double> %va, <4 x float> %vb, float % ; CHECK-LABEL: vfwmacc_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -1053,7 +1403,12 @@ define <4 x double> @vfwnmacc_vv_v4f64(<4 x double> %va, <4 x float> %vb, <4 x f ; CHECK-LABEL: vfwnmacc_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v14, v10 ; CHECK-NEXT: ret %vd = fpext <4 x float> %vb to <4 x double> %ve = fpext <4 x float> %vc to <4 x double> @@ -1067,7 +1422,15 @@ define <4 x double> @vfwnmacc_vf_v4f64(<4 x double> %va, <4 x float> %vb, float ; CHECK-LABEL: vfwnmacc_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v14, v10 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -1083,7 +1446,13 @@ define <4 x double> @vfwnmacc_fv_v4f64(<4 x double> %va, <4 x float> %vb, float ; CHECK-LABEL: vfwnmacc_fv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -1099,7 +1468,11 @@ define <4 x double> @vfwmsac_vv_v4f64(<4 x double> %va, <4 x float> %vb, <4 x fl ; CHECK-LABEL: vfwmsac_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: 
vfmacc.vv v8, v12, v14 ; CHECK-NEXT: ret %vd = fpext <4 x float> %vb to <4 x double> %ve = fpext <4 x float> %vc to <4 x double> @@ -1112,7 +1485,12 @@ define <4 x double> @vfwmsac_vf_v4f64(<4 x double> %va, <4 x float> %vb, float % ; CHECK-LABEL: vfwmsac_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -1127,7 +1505,11 @@ define <4 x double> @vfwnmsac_vv_v4f64(<4 x double> %va, <4 x float> %vb, <4 x f ; CHECK-LABEL: vfwnmsac_vv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v12 +; CHECK-NEXT: vfmacc.vv v8, v14, v10 ; CHECK-NEXT: ret %vd = fpext <4 x float> %vb to <4 x double> %ve = fpext <4 x float> %vc to <4 x double> @@ -1140,7 +1522,12 @@ define <4 x double> @vfwnmsac_vf_v4f64(<4 x double> %va, <4 x float> %vb, float ; CHECK-LABEL: vfwnmsac_vf_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -1155,7 +1542,12 @@ define <4 x double> @vfwnmsac_fv_v4f64(<4 x double> %va, <4 x float> %vb, float ; CHECK-LABEL: vfwnmsac_fv_v4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v10 +; CHECK-NEXT: vfmv.v.f v14, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x float> poison, float %c, i32 0 %splat = shufflevector <4 x float> %head, <4 x float> poison, <4 x i32> zeroinitializer @@ -1172,7 +1564,10 @@ define <8 x double> @vfwmacc_vv_v8f64(<8 x double> %va, <8 x float> %vb, <8 x fl ; CHECK-LABEL: vfwmacc_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v16, v20 ; CHECK-NEXT: ret %vd = fpext <8 x float> %vb to <8 x double> %ve = fpext <8 x float> %vc to <8 x double> @@ -1184,7 +1579,11 @@ define <8 x double> @vfwmacc_vf_v8f64(<8 x double> %va, <8 x float> %vb, float % ; CHECK-LABEL: vfwmacc_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = 
insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1198,7 +1597,12 @@ define <8 x double> @vfwnmacc_vv_v8f64(<8 x double> %va, <8 x float> %vb, <8 x f ; CHECK-LABEL: vfwnmacc_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v8 +; CHECK-NEXT: vfneg.v v8, v16 +; CHECK-NEXT: vfmadd.vv v8, v20, v12 ; CHECK-NEXT: ret %vd = fpext <8 x float> %vb to <8 x double> %ve = fpext <8 x float> %vc to <8 x double> @@ -1212,7 +1616,15 @@ define <8 x double> @vfwnmacc_vf_v8f64(<8 x double> %va, <8 x float> %vb, float ; CHECK-LABEL: vfwnmacc_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v16 +; CHECK-NEXT: vfmadd.vv v8, v20, v12 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1228,7 +1640,13 @@ define <8 x double> @vfwnmacc_fv_v8f64(<8 x double> %va, <8 x float> %vb, float ; CHECK-LABEL: vfwnmacc_fv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v16, v20 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1244,7 +1662,11 @@ define <8 x double> @vfwmsac_vv_v8f64(<8 x double> %va, <8 x float> %vb, <8 x fl ; CHECK-LABEL: vfwmsac_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v20 ; CHECK-NEXT: ret %vd = fpext <8 x float> %vb to <8 x double> %ve = fpext <8 x float> %vc to <8 x double> @@ -1257,7 +1679,12 @@ define <8 x double> @vfwmsac_vf_v8f64(<8 x double> %va, <8 x float> %vb, float % ; CHECK-LABEL: vfwmsac_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1272,7 +1699,11 @@ define <8 x double> @vfwnmsac_vv_v8f64(<8 x double> %va, <8 x float> %vb, <8 x f ; CHECK-LABEL: vfwnmsac_vv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, 
v12, v14 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v16 +; CHECK-NEXT: vfmacc.vv v8, v20, v12 ; CHECK-NEXT: ret %vd = fpext <8 x float> %vb to <8 x double> %ve = fpext <8 x float> %vc to <8 x double> @@ -1285,7 +1716,12 @@ define <8 x double> @vfwnmsac_vf_v8f64(<8 x double> %va, <8 x float> %vb, float ; CHECK-LABEL: vfwnmsac_vf_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1300,7 +1736,12 @@ define <8 x double> @vfwnmsac_fv_v8f64(<8 x double> %va, <8 x float> %vb, float ; CHECK-LABEL: vfwnmsac_fv_v8f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v12 +; CHECK-NEXT: vfmv.v.f v20, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <8 x float> poison, float %c, i32 0 %splat = shufflevector <8 x float> %head, <8 x float> poison, <8 x i32> zeroinitializer @@ -1318,7 +1759,10 @@ define <1 x double> @vfwmacc_vv_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, <1 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v10, v11 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x double> %ve = fpext <1 x half> %vc to <1 x double> @@ -1332,9 +1776,14 @@ define <1 x double> @vfwmacc_vf_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, ha ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -1351,7 +1800,12 @@ define <1 x double> @vfwnmacc_vv_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, < ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x double> %ve = fpext <1 x half> %vc to <1 x double> @@ -1367,9 +1821,17 @@ define <1 x double> 
@vfwnmacc_vf_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -1387,9 +1849,16 @@ define <1 x double> @vfwnmacc_fv_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v9, v11 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -1408,7 +1877,11 @@ define <1 x double> @vfwmsac_vv_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, <1 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v11 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x double> %ve = fpext <1 x half> %vc to <1 x double> @@ -1423,9 +1896,15 @@ define <1 x double> @vfwmsac_vf_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, ha ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -1443,7 +1922,11 @@ define <1 x double> @vfwnmsac_vv_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, < ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v 
v9, v10 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <1 x half> %vb to <1 x double> %ve = fpext <1 x half> %vc to <1 x double> @@ -1458,9 +1941,15 @@ define <1 x double> @vfwnmsac_vf_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -1477,9 +1966,15 @@ define <1 x double> @vfwnmsac_fv_v1f64_v1f16(<1 x double> %va, <1 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <1 x half> poison, half %c, i32 0 %splat = shufflevector <1 x half> %head, <1 x half> poison, <1 x i32> zeroinitializer @@ -1497,7 +1992,10 @@ define <2 x double> @vfwmacc_vv_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, <2 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v10, v11 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x double> %ve = fpext <2 x half> %vc to <2 x double> @@ -1511,9 +2009,14 @@ define <2 x double> @vfwmacc_vf_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, ha ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1530,7 +2033,12 @@ define <2 x double> @vfwnmacc_vv_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, < ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; 
CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x double> %ve = fpext <2 x half> %vc to <2 x double> @@ -1546,9 +2054,17 @@ define <2 x double> @vfwnmacc_vf_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v8, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1566,9 +2082,16 @@ define <2 x double> @vfwnmacc_fv_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v11, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v9, v11 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1587,7 +2110,11 @@ define <2 x double> @vfwmsac_vv_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, <2 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v11 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x double> %ve = fpext <2 x half> %vc to <2 x double> @@ -1602,9 +2129,15 @@ define <2 x double> @vfwmsac_vf_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, ha ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1622,7 +2155,11 @@ define <2 x double> @vfwnmsac_vv_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, < ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 ; CHECK-NEXT: vfwcvt.f.f.v v9, 
v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v11, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v10 +; CHECK-NEXT: vfmacc.vv v8, v11, v9 ; CHECK-NEXT: ret %vd = fpext <2 x half> %vb to <2 x double> %ve = fpext <2 x half> %vc to <2 x double> @@ -1637,9 +2174,15 @@ define <2 x double> @vfwnmsac_vf_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v9, v9 +; CHECK-NEXT: vfmacc.vv v8, v10, v9 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1656,9 +2199,15 @@ define <2 x double> @vfwnmsac_fv_v2f64_v2f16(<2 x double> %va, <2 x half> %vb, h ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vfmv.v.f v10, fa0 ; CHECK-NEXT: vfwcvt.f.f.v v11, v9 -; CHECK-NEXT: vfwcvt.f.f.v v9, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v11, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v11, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v9, v10 ; CHECK-NEXT: ret %head = insertelement <2 x half> poison, half %c, i32 0 %splat = shufflevector <2 x half> %head, <2 x half> poison, <2 x i32> zeroinitializer @@ -1675,9 +2224,12 @@ define <4 x double> @vfwmacc_vv_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x double> %ve = fpext <4 x half> %vc to <4 x double> @@ -1689,11 +2241,16 @@ define <4 x double> @vfwmacc_vf_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, ha ; CHECK-LABEL: vfwmacc_vf_v4f64_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v11, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v13, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, 
i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1708,9 +2265,14 @@ define <4 x double> @vfwnmacc_vv_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, < ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vv v8, v12, v14 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x double> %ve = fpext <4 x half> %vc to <4 x double> @@ -1724,11 +2286,19 @@ define <4 x double> @vfwnmacc_vf_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, h ; CHECK-LABEL: vfwnmacc_vf_v4f64_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v11, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v13, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v8, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1744,11 +2314,18 @@ define <4 x double> @vfwnmacc_fv_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, h ; CHECK-LABEL: vfwnmacc_fv_v4f64_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v11, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v13, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v14, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v10, v14 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1765,9 +2342,13 @@ define <4 x double> @vfwmsac_vv_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, <4 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x double> %ve = fpext <4 x half> %vc to <4 x double> @@ -1780,11 +2361,17 @@ define <4 x 
double> @vfwmsac_vf_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, ha ; CHECK-LABEL: vfwmsac_vf_v4f64_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v11, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v13, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1800,9 +2387,13 @@ define <4 x double> @vfwnmsac_vv_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, < ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfwcvt.f.f.v v14, v11 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %vd = fpext <4 x half> %vb to <4 x double> %ve = fpext <4 x half> %vc to <4 x double> @@ -1815,11 +2406,17 @@ define <4 x double> @vfwnmsac_vf_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, h ; CHECK-LABEL: vfwnmsac_vf_v4f64_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v11, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v13, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v10, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v12 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v10, v10 +; CHECK-NEXT: vfmacc.vv v8, v12, v10 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ -1834,11 +2431,17 @@ define <4 x double> @vfwnmsac_fv_v4f64_v4f16(<4 x double> %va, <4 x half> %vb, h ; CHECK-LABEL: vfwnmsac_fv_v4f64_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfmv.v.f v11, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v12, v10 -; CHECK-NEXT: vfwcvt.f.f.v v10, v11 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v13, v10 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v12, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v13 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v12, v14 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v10, v12 ; CHECK-NEXT: ret %head = insertelement <4 x half> poison, half %c, i32 0 %splat = shufflevector <4 x half> %head, <4 x half> poison, <4 x i32> zeroinitializer @@ 
-1853,10 +2456,13 @@ define <8 x double> @vfwmacc_vv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, <8 ; CHECK-LABEL: vfwmacc_vv_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 -; CHECK-NEXT: vfwcvt.f.f.v v16, v13 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v13 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v14, v16 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x double> %ve = fpext <8 x half> %vc to <8 x double> @@ -1868,11 +2474,16 @@ define <8 x double> @vfwmacc_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, ha ; CHECK-LABEL: vfwmacc_vf_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vfmv.v.f v18, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwmacc.vv v8, v14, v12 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1886,10 +2497,15 @@ define <8 x double> @vfwnmacc_vv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, < ; CHECK-LABEL: vfwnmacc_vv_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 -; CHECK-NEXT: vfwcvt.f.f.v v16, v13 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v13 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v14, v16 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v8 +; CHECK-NEXT: vfneg.v v8, v12 +; CHECK-NEXT: vfmadd.vv v8, v16, v20 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x double> %ve = fpext <8 x half> %vc to <8 x double> @@ -1903,11 +2519,19 @@ define <8 x double> @vfwnmacc_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h ; CHECK-LABEL: vfwnmacc_vf_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 -; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vfmv.v.f v18, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v14, v12 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v18 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmadd.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1923,11 +2547,18 @@ define <8 x double> @vfwnmacc_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h ; 
CHECK-LABEL: vfwnmacc_fv_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vfmv.v.f v18, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwnmacc.vv v8, v14, v12 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v20, v8 +; CHECK-NEXT: vfneg.v v8, v16 +; CHECK-NEXT: vfmadd.vv v8, v12, v20 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1943,10 +2574,14 @@ define <8 x double> @vfwmsac_vv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, <8 ; CHECK-LABEL: vfwmsac_vv_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 -; CHECK-NEXT: vfwcvt.f.f.v v16, v13 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v13 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v14, v16 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x double> %ve = fpext <8 x half> %vc to <8 x double> @@ -1959,11 +2594,17 @@ define <8 x double> @vfwmsac_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, ha ; CHECK-LABEL: vfwmsac_vf_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vfmv.v.f v18, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwmsac.vv v8, v14, v12 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -1978,10 +2619,14 @@ define <8 x double> @vfwnmsac_vv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, < ; CHECK-LABEL: vfwnmsac_vv_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 -; CHECK-NEXT: vfwcvt.f.f.v v16, v13 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vfwcvt.f.f.v v20, v13 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v14, v16 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %vd = fpext <8 x half> %vb to <8 x double> %ve = fpext <8 x half> %vc to <8 x double> @@ -1994,11 +2639,17 @@ define <8 x double> @vfwnmsac_vf_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h ; CHECK-LABEL: vfwnmsac_vf_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v 
v14, v12 +; CHECK-NEXT: vfmv.v.f v18, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v14, v12 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v12, v12 +; CHECK-NEXT: vfmacc.vv v8, v16, v12 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2013,11 +2664,17 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h ; CHECK-LABEL: vfwnmsac_fv_v8f64_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v16, fa0 -; CHECK-NEXT: vfwcvt.f.f.v v14, v12 +; CHECK-NEXT: vfmv.v.f v18, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v20, v18 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; CHECK-NEXT: vfwnmsac.vv v8, v14, v12 +; CHECK-NEXT: vfwcvt.f.f.v v16, v20 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfneg.v v16, v16 +; CHECK-NEXT: vfmacc.vv v8, v12, v16 ; CHECK-NEXT: ret %head = insertelement <8 x half> poison, half %c, i32 0 %splat = shufflevector <8 x half> %head, <8 x half> poison, <8 x i32> zeroinitializer @@ -2031,8 +2688,11 @@ define <8 x double> @vfwnmsac_fv_v8f64_v8f16(<8 x double> %va, <8 x half> %vb, h define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) { ; CHECK-LABEL: vfwmacc_vf2_v2f32: ; CHECK: # %bb.0: +; CHECK-NEXT: fcvt.s.h fa5, fa0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmacc.vf v8, fa5, v10 ; CHECK-NEXT: ret %cext = fpext half %c to float %head = insertelement <2 x float> poison, float %cext, i32 0 @@ -2045,8 +2705,12 @@ define <2 x float> @vfwmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) { ; CHECK-LABEL: vfwmsac_vf2_v2f32: ; CHECK: # %bb.0: +; CHECK-NEXT: fcvt.s.h fa5, fa0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v8, v8 +; CHECK-NEXT: vfmacc.vf v8, fa5, v10 ; CHECK-NEXT: ret %cext = fpext half %c to float %head = insertelement <2 x float> poison, float %cext, i32 0 @@ -2060,8 +2724,13 @@ define <2 x float> @vfwmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) { ; CHECK-LABEL: vfwnmacc_vf2_v2f32: ; CHECK: # %bb.0: +; CHECK-NEXT: fcvt.s.h fa5, fa0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmacc.vf v8, fa0, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v9, v8 +; CHECK-NEXT: vfneg.v v8, v10 +; CHECK-NEXT: vfmadd.vf v8, fa5, v9 ; CHECK-NEXT: ret %cext = fpext half %c to float %head = insertelement <2 x float> poison, float %cext, i32 0 @@ -2076,8 +2745,12 @@ define <2 x float> @vfwnmacc_vf2_v2f32(<2 x float> %va, <2 x 
half> %vb, half %c) define <2 x float> @vfwnmsac_vf2_v2f32(<2 x float> %va, <2 x half> %vb, half %c) { ; CHECK-LABEL: vfwnmsac_vf2_v2f32: ; CHECK: # %bb.0: +; CHECK-NEXT: fcvt.s.h fa5, fa0 ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwnmsac.vf v8, fa0, v9 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfneg.v v9, v10 +; CHECK-NEXT: vfmacc.vf v8, fa5, v9 ; CHECK-NEXT: ret %cext = fpext half %c to float %head = insertelement <2 x float> poison, float %cext, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll index aec970adff51e..521de12a5afa4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll @@ -8,7 +8,10 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x b ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v1f32: ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v11, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v9, v10 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v11, v9 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v1f32: @@ -31,7 +34,10 @@ define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> ; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 ; ZVFBFWMA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFBFWMA-NEXT: vmv.s.x v10, a0 -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v11, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v11, v10 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v1f32: @@ -56,7 +62,10 @@ define <2 x float> @vfwmaccbf16_vv_v2f32(<2 x float> %a, <2 x bfloat> %b, <2 x b ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v2f32: ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v11, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v9, v10 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v11, v9 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v2f32: @@ -79,7 +88,10 @@ define <2 x float> @vfwmaccbf16_vf_v2f32(<2 x float> %a, bfloat %b, <2 x bfloat> ; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 ; ZVFBFWMA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFBFWMA-NEXT: vmv.v.x v10, a0 -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v11, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v11, v10 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v2f32: @@ -104,7 +116,10 @@ define <4 x float> @vfwmaccbf16_vv_v4f32(<4 x float> %a, <4 x bfloat> %b, <4 x b ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v4f32: ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v11, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v9, v10 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v11, v9 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v4f32: @@ -127,7 +142,10 @@ define <4 x float> @vfwmaccbf16_vf_v4f32(<4 x float> %a, bfloat %b, <4 x bfloat> ; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 ; ZVFBFWMA-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; 
ZVFBFWMA-NEXT: vmv.v.x v10, a0 -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v11, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v11, v10 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v4f32: @@ -152,7 +170,10 @@ define <8 x float> @vfwmaccbf16_vv_v8f32(<8 x float> %a, <8 x bfloat> %b, <8 x b ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v8f32: ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v11 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v12, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v14, v11 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v12, v14 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v8f32: @@ -175,7 +196,10 @@ define <8 x float> @vfwmaccbf16_vf_v8f32(<8 x float> %a, bfloat %b, <8 x bfloat> ; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 ; ZVFBFWMA-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFBFWMA-NEXT: vmv.v.x v11, a0 -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v11, v10 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v12, v11 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v14, v10 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v12, v14 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v8f32: @@ -200,7 +224,10 @@ define <16 x float> @vfwmaccbf16_vv_v16f32(<16 x float> %a, <16 x bfloat> %b, <1 ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v16f32: ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v12, v14 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v20, v14 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v16, v20 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v16f32: @@ -223,7 +250,10 @@ define <16 x float> @vfwmaccbf16_vf_v16f32(<16 x float> %a, bfloat %b, <16 x bfl ; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 ; ZVFBFWMA-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFBFWMA-NEXT: vmv.v.x v14, a0 -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v14, v12 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v16, v14 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v20, v12 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v16, v20 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v16f32: @@ -249,7 +279,10 @@ define <32 x float> @vfwmaccbf32_vv_v32f32(<32 x float> %a, <32 x bfloat> %b, <3 ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: li a0, 32 ; ZVFBFWMA-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v16, v20 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v0, v20 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v24, v0 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf32_vv_v32f32: @@ -274,7 +307,10 @@ define <32 x float> @vfwmaccbf32_vf_v32f32(<32 x float> %a, bfloat %b, <32 x bfl ; ZVFBFWMA-NEXT: li a1, 32 ; ZVFBFWMA-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; ZVFBFWMA-NEXT: vmv.v.x v20, a0 -; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v20, v16 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v0, v16 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vv v8, v24, v0 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf32_vf_v32f32: @@ -300,7 +336,10 @@ define <4 x float> @vfwmaccbf16_vf_v4f32_scalar_extend(<4 x float> %rd, bfloat % ; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v4f32_scalar_extend: ; ZVFBFWMA: # %bb.0: ; ZVFBFWMA-NEXT: 
vsetivli zero, 4, e16, mf2, ta, ma -; ZVFBFWMA-NEXT: vfwmaccbf16.vf v8, fa0, v9 +; ZVFBFWMA-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa0 +; ZVFBFWMA-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFWMA-NEXT: vfmacc.vf v8, fa5, v10 ; ZVFBFWMA-NEXT: ret ; ; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v4f32_scalar_extend: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll index 47ac1c1a88df4..59c47b60236ab 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll @@ -8,9 +8,12 @@ define <2 x float> @vfwmul_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwmul_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwmul.vv v8, v9, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = load <2 x half>, ptr %y @@ -24,9 +27,12 @@ define <4 x float> @vfwmul_v4f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwmul_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwmul.vv v8, v9, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b = load <4 x half>, ptr %y @@ -41,8 +47,11 @@ define <8 x float> @vfwmul_v8f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vfwmul.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -57,8 +66,11 @@ define <16 x float> @vfwmul_v16f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vfwmul.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x half>, ptr %x %b = load <16 x half>, ptr %y @@ -74,8 +86,11 @@ define <32 x float> @vfwmul_v32f16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vfwmul.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x half>, ptr %x %b = load <32 x half>, ptr %y @@ -97,18 +112,27 @@ define <64 x float> @vfwmul_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; 
CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwmul.vv v8, v16, v24 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwmul.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v0 +; CHECK-NEXT: vfmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -128,9 +152,12 @@ define <2 x double> @vfwmul_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwmul_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vfwmul.vv v8, v9, v10 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = load <2 x float>, ptr %y @@ -145,8 +172,11 @@ define <4 x double> @vfwmul_v4f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vfwmul.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x float>, ptr %y @@ -161,8 +191,11 @@ define <8 x double> @vfwmul_v8f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vfwmul.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = load <8 x float>, ptr %y @@ -177,8 +210,11 @@ define <16 x double> @vfwmul_v16f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vfwmul.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = load <16 x float>, ptr %y @@ -200,16 +236,25 @@ define <32 x double> @vfwmul_v32f32(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 
16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwmul.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwmul.vv v16, v24, v0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v0 +; CHECK-NEXT: vfmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -229,8 +274,12 @@ define <2 x float> @vfwmul_vf_v2f16(ptr %x, half %y) { ; CHECK-LABEL: vfwmul_vf_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwmul.vf v8, v9, fa0 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = insertelement <2 x half> poison, half %y, i32 0 @@ -245,8 +294,12 @@ define <4 x float> @vfwmul_vf_v4f16(ptr %x, half %y) { ; CHECK-LABEL: vfwmul_vf_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwmul.vf v8, v9, fa0 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b = insertelement <4 x half> poison, half %y, i32 0 @@ -262,7 +315,11 @@ define <8 x float> @vfwmul_vf_v8f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwmul.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x half>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 @@ -278,7 +335,11 @@ define <16 x float> @vfwmul_vf_v16f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vfwmul.vf v8, v12, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x half>, ptr %x %b = insertelement <16 x half> poison, half %y, i32 0 @@ -295,7 +356,11 @@ define <32 x float> @vfwmul_vf_v32f16(ptr %x, half %y) { ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vfwmul.vf v8, v16, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; 
CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x half>, ptr %x %b = insertelement <32 x half> poison, half %y, i32 0 @@ -310,8 +375,12 @@ define <2 x double> @vfwmul_vf_v2f32(ptr %x, float %y) { ; CHECK-LABEL: vfwmul_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vfwmul.vf v8, v9, fa0 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = insertelement <2 x float> poison, float %y, i32 0 @@ -327,7 +396,11 @@ define <4 x double> @vfwmul_vf_v4f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vfwmul.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = insertelement <4 x float> poison, float %y, i32 0 @@ -343,7 +416,11 @@ define <8 x double> @vfwmul_vf_v8f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vfwmul.vf v8, v12, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = insertelement <8 x float> poison, float %y, i32 0 @@ -359,7 +436,11 @@ define <16 x double> @vfwmul_vf_v16f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vfwmul.vf v8, v16, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = insertelement <16 x float> poison, float %y, i32 0 @@ -373,14 +454,35 @@ define <16 x double> @vfwmul_vf_v16f32(ptr %x, float %y) { define <32 x double> @vfwmul_vf_v32f32(ptr %x, float %y) { ; CHECK-LABEL: vfwmul_vf_v32f32: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwmul.vf v8, v16, fa0 -; CHECK-NEXT: vfwmul.vf v16, v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; 
CHECK-NEXT: vfmul.vv v8, v0, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfmul.vv v16, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 @@ -395,8 +497,10 @@ define <2 x float> @vfwmul_squared_v2f16_v2f32(ptr %x) { ; CHECK-LABEL: vfwmul_squared_v2f16_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwmul.vv v8, v9, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v9, v9 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = fpext <2 x half> %a to <2 x float> @@ -408,8 +512,10 @@ define <2 x double> @vfwmul_squared_v2f32_v2f64(ptr %x) { ; CHECK-LABEL: vfwmul_squared_v2f32_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vfwmul.vv v8, v9, v9 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmul.vv v8, v9, v9 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = fpext <2 x float> %a to <2 x double> @@ -424,7 +530,9 @@ define <2 x double> @vfwmul_squared_v2f16_v2f64(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vfwcvt.f.f.v v9, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vfwmul.vv v8, v9, v9 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfmul.vv v8, v8, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = fpext <2 x half> %a to <2 x double> @@ -436,8 +544,10 @@ define <2 x float> @vfwmul_vf2_v2f32(<2 x half> %x, half %y) { ; CHECK-LABEL: vfwmul_vf2_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfwmul.vf v9, v8, fa0 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vfwcvt.f.f.v v9, v8 +; CHECK-NEXT: fcvt.s.h fa5, fa0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfmul.vf v8, v9, fa5 ; CHECK-NEXT: ret %a = fpext <2 x half> %x to <2 x float> %b = fpext half %y to float diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll index 25f6b5ab27411..6cec6302654d4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll @@ -8,9 +8,12 @@ define <2 x float> @vfwsub_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwsub.vv v8, v9, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = load <2 x half>, ptr %y @@ -24,9 +27,12 @@ define <4 x float> @vfwsub_v4f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwsub.vv v8, v9, v10 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: 
vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b = load <4 x half>, ptr %y @@ -41,8 +47,11 @@ define <8 x float> @vfwsub_v8f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vfwsub.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x half>, ptr %x %b = load <8 x half>, ptr %y @@ -57,8 +66,11 @@ define <16 x float> @vfwsub_v16f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vfwsub.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x half>, ptr %x %b = load <16 x half>, ptr %y @@ -74,8 +86,11 @@ define <32 x float> @vfwsub_v32f16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vfwsub.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x half>, ptr %x %b = load <32 x half>, ptr %y @@ -97,18 +112,27 @@ define <64 x float> @vfwsub_v64f16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwsub.vv v8, v16, v24 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwsub.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v0 +; CHECK-NEXT: vfsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -128,9 +152,12 @@ define <2 x double> @vfwsub_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vfwsub.vv v8, v9, v10 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vfwcvt.f.f.v 
v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = load <2 x float>, ptr %y @@ -145,8 +172,11 @@ define <4 x double> @vfwsub_v4f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vfwsub.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x float>, ptr %y @@ -161,8 +191,11 @@ define <8 x double> @vfwsub_v8f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vfwsub.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = load <8 x float>, ptr %y @@ -177,8 +210,11 @@ define <16 x double> @vfwsub_v16f32(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vfwsub.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = load <16 x float>, ptr %y @@ -200,16 +236,25 @@ define <32 x double> @vfwsub_v32f32(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfwcvt.f.f.v v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwsub.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwsub.vv v16, v24, v0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vfwcvt.f.f.v v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v0 +; CHECK-NEXT: vfsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -229,8 +274,12 @@ define <2 x float> @vfwsub_vf_v2f16(ptr %x, half %y) { ; CHECK-LABEL: vfwsub_vf_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwsub.vf v8, v9, fa0 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, 
e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x half>, ptr %x %b = insertelement <2 x half> poison, half %y, i32 0 @@ -245,8 +294,12 @@ define <4 x float> @vfwsub_vf_v4f16(ptr %x, half %y) { ; CHECK-LABEL: vfwsub_vf_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vfwsub.vf v8, v9, fa0 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x half>, ptr %x %b = insertelement <4 x half> poison, half %y, i32 0 @@ -262,7 +315,11 @@ define <8 x float> @vfwsub_vf_v8f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vfwsub.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x half>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 @@ -278,7 +335,11 @@ define <16 x float> @vfwsub_vf_v16f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vfwsub.vf v8, v12, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x half>, ptr %x %b = insertelement <16 x half> poison, half %y, i32 0 @@ -295,7 +356,11 @@ define <32 x float> @vfwsub_vf_v32f16(ptr %x, half %y) { ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vfwsub.vf v8, v16, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x half>, ptr %x %b = insertelement <32 x half> poison, half %y, i32 0 @@ -310,8 +375,12 @@ define <2 x double> @vfwsub_vf_v2f32(ptr %x, float %y) { ; CHECK-LABEL: vfwsub_vf_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vfwsub.vf v8, v9, fa0 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = insertelement <2 x float> poison, float %y, i32 0 @@ -327,7 +396,11 @@ define <4 x double> @vfwsub_vf_v4f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vfwsub.vf v8, v10, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = insertelement <4 x float> poison, float %y, i32 0 @@ -343,7 +416,11 @@ define <8 x double> @vfwsub_vf_v8f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vfwsub.vf v8, v12, fa0 +; 
CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = insertelement <8 x float> poison, float %y, i32 0 @@ -359,7 +436,11 @@ define <16 x double> @vfwsub_vf_v16f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vfwsub.vf v8, v16, fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = insertelement <16 x float> poison, float %y, i32 0 @@ -373,14 +454,35 @@ define <16 x double> @vfwsub_vf_v16f32(ptr %x, float %y) { define <32 x double> @vfwsub_vf_v32f32(ptr %x, float %y) { ; CHECK-LABEL: vfwsub_vf_v32f32: ; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v8, 16 ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vfwsub.vf v8, v16, fa0 -; CHECK-NEXT: vfwsub.vf v16, v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v0, v8 +; CHECK-NEXT: vfwcvt.f.f.v v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v0, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfsub.vv v16, v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = insertelement <32 x float> poison, float %y, i32 0 @@ -395,9 +497,11 @@ define <2 x float> @vfwsub_wv_v2f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v9, v10 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = load <2 x half>, ptr %y @@ -410,9 +514,11 @@ define <4 x float> @vfwsub_wv_v4f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v9, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = load <4 x half>, ptr %y @@ -425,9 
+531,11 @@ define <8 x float> @vfwsub_wv_v8f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v12, (a1) ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = load <8 x half>, ptr %y @@ -440,9 +548,11 @@ define <16 x float> @vfwsub_wv_v16f16(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v16f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v16, (a1) ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v12, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = load <16 x half>, ptr %y @@ -456,9 +566,11 @@ define <32 x float> @vfwsub_wv_v32f16(ptr %x, ptr %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vle16.v v24, (a1) ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vle16.v v16, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x float>, ptr %x %b = load <32 x half>, ptr %y @@ -471,9 +583,11 @@ define <2 x double> @vfwsub_wv_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v9, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v9 +; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vfwcvt.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v9, v10 ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = load <2 x float>, ptr %y @@ -486,9 +600,11 @@ define <4 x double> @vfwsub_wv_v4f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v12, (a1) ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v10 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x double>, ptr %x %b = load <4 x float>, ptr %y @@ -501,9 +617,11 @@ define <8 x double> @vfwsub_wv_v8f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v8f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v16, (a1) ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v12, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v12 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x double>, ptr %x %b = load <8 x float>, ptr %y @@ -516,9 +634,11 @@ define <16 x double> @vfwsub_wv_v16f32(ptr %x, ptr %y) { ; CHECK-LABEL: vfwsub_wv_v16f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: vfwsub.wv v8, v8, v16 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x 
double>, ptr %x %b = load <16 x float>, ptr %y @@ -532,7 +652,10 @@ define <2 x float> @vfwsub_wf_v2f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <2 x float>, ptr %x %b = insertelement <2 x half> poison, half %y, i32 0 @@ -547,7 +670,10 @@ define <4 x float> @vfwsub_wf_v4f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x float>, ptr %x %b = insertelement <4 x half> poison, half %y, i32 0 @@ -562,7 +688,10 @@ define <8 x float> @vfwsub_wf_v8f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x float>, ptr %x %b = insertelement <8 x half> poison, half %y, i32 0 @@ -577,7 +706,10 @@ define <16 x float> @vfwsub_wf_v16f16(ptr %x, half %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x float>, ptr %x %b = insertelement <16 x half> poison, half %y, i32 0 @@ -592,7 +724,10 @@ define <2 x double> @vfwsub_wf_v2f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v9, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v9 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <2 x double>, ptr %x %b = insertelement <2 x float> poison, float %y, i32 0 @@ -607,7 +742,10 @@ define <4 x double> @vfwsub_wf_v4f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v12, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x double>, ptr %x %b = insertelement <4 x float> poison, float %y, i32 0 @@ -622,7 +760,10 @@ define <8 x double> @vfwsub_wf_v8f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, fa0 +; CHECK-NEXT: vfmv.v.f v16, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x double>, ptr %x %b = insertelement <8 x float> poison, float %y, i32 0 @@ -637,7 +778,10 @@ define <16 x double> @vfwsub_wf_v16f32(ptr %x, float %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vfwsub.wf v8, v8, 
fa0 +; CHECK-NEXT: vfmv.v.f v24, fa0 +; CHECK-NEXT: vfwcvt.f.f.v v16, v24 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x double>, ptr %x %b = insertelement <16 x float> poison, float %y, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll index f5a31d7eaadbe..522195fe06cc2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmacc-vp.ll @@ -12,8 +12,11 @@ declare <2 x i8> @llvm.vp.select.nxv2i8(<2 x i1>, <2 x i8>, <2 x i8>, i32) define <2 x i8> @vmacc_vv_nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -25,8 +28,11 @@ define <2 x i8> @vmacc_vv_nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1 define <2 x i8> @vmacc_vv_nxv2i8_unmasked(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -38,8 +44,11 @@ define <2 x i8> @vmacc_vv_nxv2i8_unmasked(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, define <2 x i8> @vmacc_vx_nxv2i8(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 @@ -53,8 +62,11 @@ define <2 x i8> @vmacc_vx_nxv2i8(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, define <2 x i8> @vmacc_vx_nxv2i8_unmasked(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 @@ -68,9 +80,10 @@ define <2 x i8> @vmacc_vx_nxv2i8_unmasked(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x define <2 x i8> @vmacc_vv_nxv2i8_ta(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, 
mf8, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i8> @llvm.vp.add.nxv2i8(<2 x i8> %x, <2 x i8> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -81,9 +94,10 @@ define <2 x i8> @vmacc_vv_nxv2i8_ta(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x define <2 x i8> @vmacc_vx_nxv2i8_ta(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer @@ -101,8 +115,11 @@ declare <4 x i8> @llvm.vp.select.nxv4i8(<4 x i1>, <4 x i8>, <4 x i8>, i32) define <4 x i8> @vmacc_vv_nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i8> @llvm.vp.mul.nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -114,8 +131,11 @@ define <4 x i8> @vmacc_vv_nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1 define <4 x i8> @vmacc_vv_nxv4i8_unmasked(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i8> @llvm.vp.mul.nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -127,8 +147,11 @@ define <4 x i8> @vmacc_vv_nxv4i8_unmasked(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, define <4 x i8> @vmacc_vx_nxv4i8(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 @@ -142,8 +165,11 @@ define <4 x i8> @vmacc_vx_nxv4i8(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, define <4 x i8> @vmacc_vx_nxv4i8_unmasked(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma -; CHECK-NEXT: vmacc.vx v9, 
a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 @@ -157,9 +183,10 @@ define <4 x i8> @vmacc_vx_nxv4i8_unmasked(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x define <4 x i8> @vmacc_vv_nxv4i8_ta(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i8> @llvm.vp.mul.nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i8> @llvm.vp.add.nxv4i8(<4 x i8> %x, <4 x i8> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -170,9 +197,10 @@ define <4 x i8> @vmacc_vv_nxv4i8_ta(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x define <4 x i8> @vmacc_vx_nxv4i8_ta(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -190,8 +218,11 @@ declare <8 x i8> @llvm.vp.select.nxv8i8(<8 x i1>, <8 x i8>, <8 x i8>, i32) define <8 x i8> @vmacc_vv_nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.vp.mul.nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -203,8 +234,11 @@ define <8 x i8> @vmacc_vv_nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1 define <8 x i8> @vmacc_vv_nxv8i8_unmasked(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.vp.mul.nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -216,8 +250,11 @@ define <8 x i8> @vmacc_vv_nxv8i8_unmasked(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, define <8 x i8> @vmacc_vx_nxv8i8(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, 
ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 @@ -231,8 +268,11 @@ define <8 x i8> @vmacc_vx_nxv8i8(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, define <8 x i8> @vmacc_vx_nxv8i8_unmasked(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 @@ -246,9 +286,10 @@ define <8 x i8> @vmacc_vx_nxv8i8_unmasked(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x define <8 x i8> @vmacc_vv_nxv8i8_ta(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.vp.mul.nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i8> @llvm.vp.add.nxv8i8(<8 x i8> %x, <8 x i8> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -259,9 +300,10 @@ define <8 x i8> @vmacc_vv_nxv8i8_ta(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x define <8 x i8> @vmacc_vx_nxv8i8_ta(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -279,8 +321,11 @@ declare <16 x i8> @llvm.vp.select.nxv16i8(<16 x i1>, <16 x i8>, <16 x i8>, i32) define <16 x i8> @vmacc_vv_nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.vp.mul.nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -292,8 +337,11 @@ define <16 x i8> @vmacc_vv_nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1 define <16 x i8> @vmacc_vv_nxv16i8_unmasked(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv 
v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.vp.mul.nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -305,8 +353,11 @@ define <16 x i8> @vmacc_vv_nxv16i8_unmasked(<16 x i8> %a, <16 x i8> %b, <16 x i8 define <16 x i8> @vmacc_vx_nxv16i8(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 @@ -320,8 +371,11 @@ define <16 x i8> @vmacc_vx_nxv16i8(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> define <16 x i8> @vmacc_vx_nxv16i8_unmasked(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 @@ -335,9 +389,10 @@ define <16 x i8> @vmacc_vx_nxv16i8_unmasked(<16 x i8> %a, i8 %b, <16 x i8> %c, define <16 x i8> @vmacc_vv_nxv16i8_ta(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.vp.mul.nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i1> splat (i1 -1), i32 %evl) %y = call <16 x i8> @llvm.vp.add.nxv16i8(<16 x i8> %x, <16 x i8> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -348,9 +403,10 @@ define <16 x i8> @vmacc_vv_nxv16i8_ta(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, define <16 x i8> @vmacc_vx_nxv16i8_ta(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer @@ -368,8 +424,11 @@ declare <32 x i8> @llvm.vp.select.nxv32i8(<32 x i1>, <32 x i8>, <32 x i8>, i32) define <32 x i8> @vmacc_vv_nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma 
+; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <32 x i8> @llvm.vp.mul.nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -381,8 +440,11 @@ define <32 x i8> @vmacc_vv_nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 define <32 x i8> @vmacc_vv_nxv32i8_unmasked(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv32i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma -; CHECK-NEXT: vmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <32 x i8> @llvm.vp.mul.nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -394,8 +456,11 @@ define <32 x i8> @vmacc_vv_nxv32i8_unmasked(<32 x i8> %a, <32 x i8> %b, <32 x i8 define <32 x i8> @vmacc_vx_nxv32i8(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu -; CHECK-NEXT: vmacc.vx v10, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i8> poison, i8 %b, i32 0 @@ -409,8 +474,11 @@ define <32 x i8> @vmacc_vx_nxv32i8(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> define <32 x i8> @vmacc_vx_nxv32i8_unmasked(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv32i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, ma -; CHECK-NEXT: vmacc.vx v10, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i8> poison, i8 %b, i32 0 @@ -424,9 +492,10 @@ define <32 x i8> @vmacc_vx_nxv32i8_unmasked(<32 x i8> %a, i8 %b, <32 x i8> %c, define <32 x i8> @vmacc_vv_nxv32i8_ta(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv32i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <32 x i8> @llvm.vp.mul.nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i1> splat (i1 -1), i32 %evl) %y = call <32 x i8> @llvm.vp.add.nxv32i8(<32 x i8> %x, <32 x i8> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -437,9 +506,10 @@ define <32 x i8> @vmacc_vv_nxv32i8_ta(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, define <32 x i8> @vmacc_vx_nxv32i8_ta(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv32i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vmacc.vx v10, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; 
CHECK-NEXT: ret %elt.head = insertelement <32 x i8> poison, i8 %b, i32 0 %vb = shufflevector <32 x i8> %elt.head, <32 x i8> poison, <32 x i32> zeroinitializer @@ -457,8 +527,11 @@ declare <64 x i8> @llvm.vp.select.nxv64i8(<64 x i1>, <64 x i8>, <64 x i8>, i32) define <64 x i8> @vmacc_vv_nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <64 x i8> @llvm.vp.mul.nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i1> splat (i1 -1), i32 %evl) @@ -470,8 +543,11 @@ define <64 x i8> @vmacc_vv_nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <6 define <64 x i8> @vmacc_vv_nxv64i8_unmasked(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv64i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: vmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <64 x i8> @llvm.vp.mul.nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i1> splat (i1 -1), i32 %evl) @@ -483,8 +559,11 @@ define <64 x i8> @vmacc_vv_nxv64i8_unmasked(<64 x i8> %a, <64 x i8> %b, <64 x i8 define <64 x i8> @vmacc_vx_nxv64i8(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu -; CHECK-NEXT: vmacc.vx v12, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i8> poison, i8 %b, i32 0 @@ -498,8 +577,11 @@ define <64 x i8> @vmacc_vx_nxv64i8(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> define <64 x i8> @vmacc_vx_nxv64i8_unmasked(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv64i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-NEXT: vmacc.vx v12, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i8> poison, i8 %b, i32 0 @@ -513,9 +595,10 @@ define <64 x i8> @vmacc_vx_nxv64i8_unmasked(<64 x i8> %a, i8 %b, <64 x i8> %c, define <64 x i8> @vmacc_vv_nxv64i8_ta(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv64i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <64 x i8> @llvm.vp.mul.nxv64i8(<64 x i8> %a, <64 x i8> %b, 
<64 x i1> splat (i1 -1), i32 %evl) %y = call <64 x i8> @llvm.vp.add.nxv64i8(<64 x i8> %x, <64 x i8> %c, <64 x i1> splat (i1 -1), i32 %evl) @@ -526,9 +609,10 @@ define <64 x i8> @vmacc_vv_nxv64i8_ta(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, define <64 x i8> @vmacc_vx_nxv64i8_ta(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv64i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vmacc.vx v12, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i8> poison, i8 %b, i32 0 %vb = shufflevector <64 x i8> %elt.head, <64 x i8> poison, <64 x i32> zeroinitializer @@ -546,8 +630,11 @@ declare <2 x i16> @llvm.vp.select.nxv2i16(<2 x i1>, <2 x i16>, <2 x i16>, i32) define <2 x i16> @vmacc_vv_nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i16> @llvm.vp.mul.nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -559,8 +646,11 @@ define <2 x i16> @vmacc_vv_nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 define <2 x i16> @vmacc_vv_nxv2i16_unmasked(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i16> @llvm.vp.mul.nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -572,8 +662,11 @@ define <2 x i16> @vmacc_vv_nxv2i16_unmasked(<2 x i16> %a, <2 x i16> %b, <2 x i16 define <2 x i16> @vmacc_vx_nxv2i16(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 @@ -587,8 +680,11 @@ define <2 x i16> @vmacc_vx_nxv2i16(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> define <2 x i16> @vmacc_vx_nxv2i16_unmasked(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret 
%elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 @@ -602,9 +698,10 @@ define <2 x i16> @vmacc_vx_nxv2i16_unmasked(<2 x i16> %a, i16 %b, <2 x i16> %c, define <2 x i16> @vmacc_vv_nxv2i16_ta(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i16> @llvm.vp.mul.nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i16> @llvm.vp.add.nxv2i16(<2 x i16> %x, <2 x i16> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -615,9 +712,10 @@ define <2 x i16> @vmacc_vv_nxv2i16_ta(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, define <2 x i16> @vmacc_vx_nxv2i16_ta(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer @@ -635,8 +733,11 @@ declare <4 x i16> @llvm.vp.select.nxv4i16(<4 x i1>, <4 x i16>, <4 x i16>, i32) define <4 x i16> @vmacc_vv_nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i16> @llvm.vp.mul.nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -648,8 +749,11 @@ define <4 x i16> @vmacc_vv_nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 define <4 x i16> @vmacc_vv_nxv4i16_unmasked(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i16> @llvm.vp.mul.nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -661,8 +765,11 @@ define <4 x i16> @vmacc_vv_nxv4i16_unmasked(<4 x i16> %a, <4 x i16> %b, <4 x i16 define <4 x i16> @vmacc_vx_nxv4i16(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v 
v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 @@ -676,8 +783,11 @@ define <4 x i16> @vmacc_vx_nxv4i16(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> define <4 x i16> @vmacc_vx_nxv4i16_unmasked(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 @@ -691,9 +801,10 @@ define <4 x i16> @vmacc_vx_nxv4i16_unmasked(<4 x i16> %a, i16 %b, <4 x i16> %c, define <4 x i16> @vmacc_vv_nxv4i16_ta(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i16> @llvm.vp.mul.nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i16> @llvm.vp.add.nxv4i16(<4 x i16> %x, <4 x i16> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -704,9 +815,10 @@ define <4 x i16> @vmacc_vv_nxv4i16_ta(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, define <4 x i16> @vmacc_vx_nxv4i16_ta(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -724,8 +836,11 @@ declare <8 x i16> @llvm.vp.select.nxv8i16(<8 x i1>, <8 x i16>, <8 x i16>, i32) define <8 x i16> @vmacc_vv_nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.vp.mul.nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -737,8 +852,11 @@ define <8 x i16> @vmacc_vv_nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 define <8 x i16> @vmacc_vv_nxv8i16_unmasked(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret 
%x = call <8 x i16> @llvm.vp.mul.nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -750,8 +868,11 @@ define <8 x i16> @vmacc_vv_nxv8i16_unmasked(<8 x i16> %a, <8 x i16> %b, <8 x i16 define <8 x i16> @vmacc_vx_nxv8i16(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 @@ -765,8 +886,11 @@ define <8 x i16> @vmacc_vx_nxv8i16(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> define <8 x i16> @vmacc_vx_nxv8i16_unmasked(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 @@ -780,9 +904,10 @@ define <8 x i16> @vmacc_vx_nxv8i16_unmasked(<8 x i16> %a, i16 %b, <8 x i16> %c, define <8 x i16> @vmacc_vv_nxv8i16_ta(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.vp.mul.nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i16> @llvm.vp.add.nxv8i16(<8 x i16> %x, <8 x i16> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -793,9 +918,10 @@ define <8 x i16> @vmacc_vv_nxv8i16_ta(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, define <8 x i16> @vmacc_vx_nxv8i16_ta(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer @@ -813,8 +939,11 @@ declare <16 x i16> @llvm.vp.select.nxv16i16(<16 x i1>, <16 x i16>, <16 x i16>, i define <16 x i16> @vmacc_vv_nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <16 x i16> 
@llvm.vp.mul.nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -826,8 +955,11 @@ define <16 x i16> @vmacc_vv_nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c define <16 x i16> @vmacc_vv_nxv16i16_unmasked(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.vp.mul.nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -839,8 +971,11 @@ define <16 x i16> @vmacc_vv_nxv16i16_unmasked(<16 x i16> %a, <16 x i16> %b, <16 define <16 x i16> @vmacc_vx_nxv16i16(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, mu -; CHECK-NEXT: vmacc.vx v10, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 @@ -854,8 +989,11 @@ define <16 x i16> @vmacc_vx_nxv16i16(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 define <16 x i16> @vmacc_vx_nxv16i16_unmasked(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, ma -; CHECK-NEXT: vmacc.vx v10, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 @@ -869,9 +1007,10 @@ define <16 x i16> @vmacc_vx_nxv16i16_unmasked(<16 x i16> %a, i16 %b, <16 x i16> define <16 x i16> @vmacc_vv_nxv16i16_ta(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.vp.mul.nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i1> splat (i1 -1), i32 %evl) %y = call <16 x i16> @llvm.vp.add.nxv16i16(<16 x i16> %x, <16 x i16> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -882,9 +1021,10 @@ define <16 x i16> @vmacc_vv_nxv16i16_ta(<16 x i16> %a, <16 x i16> %b, <16 x i16> define <16 x i16> @vmacc_vx_nxv16i16_ta(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu -; CHECK-NEXT: vmacc.vx v10, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 
x i16> poison, i16 %b, i32 0 %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer @@ -902,8 +1042,11 @@ declare <32 x i16> @llvm.vp.select.nxv32i16(<32 x i1>, <32 x i16>, <32 x i16>, i define <32 x i16> @vmacc_vv_nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <32 x i16> @llvm.vp.mul.nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -915,8 +1058,11 @@ define <32 x i16> @vmacc_vv_nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c define <32 x i16> @vmacc_vv_nxv32i16_unmasked(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv32i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <32 x i16> @llvm.vp.mul.nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -928,8 +1074,11 @@ define <32 x i16> @vmacc_vv_nxv32i16_unmasked(<32 x i16> %a, <32 x i16> %b, <32 define <32 x i16> @vmacc_vx_nxv32i16(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu -; CHECK-NEXT: vmacc.vx v12, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i16> poison, i16 %b, i32 0 @@ -943,8 +1092,11 @@ define <32 x i16> @vmacc_vx_nxv32i16(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 define <32 x i16> @vmacc_vx_nxv32i16_unmasked(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv32i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, ma -; CHECK-NEXT: vmacc.vx v12, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i16> poison, i16 %b, i32 0 @@ -958,9 +1110,10 @@ define <32 x i16> @vmacc_vx_nxv32i16_unmasked(<32 x i16> %a, i16 %b, <32 x i16> define <32 x i16> @vmacc_vv_nxv32i16_ta(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv32i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <32 x i16> @llvm.vp.mul.nxv32i16(<32 x 
i16> %a, <32 x i16> %b, <32 x i1> splat (i1 -1), i32 %evl) %y = call <32 x i16> @llvm.vp.add.nxv32i16(<32 x i16> %x, <32 x i16> %c, <32 x i1> splat (i1 -1), i32 %evl) @@ -971,9 +1124,10 @@ define <32 x i16> @vmacc_vv_nxv32i16_ta(<32 x i16> %a, <32 x i16> %b, <32 x i16> define <32 x i16> @vmacc_vx_nxv32i16_ta(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv32i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vmacc.vx v12, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i16> poison, i16 %b, i32 0 %vb = shufflevector <32 x i16> %elt.head, <32 x i16> poison, <32 x i32> zeroinitializer @@ -991,8 +1145,11 @@ declare <2 x i32> @llvm.vp.select.nxv2i32(<2 x i1>, <2 x i32>, <2 x i32>, i32) define <2 x i32> @vmacc_vv_nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i32> @llvm.vp.mul.nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1004,8 +1161,11 @@ define <2 x i32> @vmacc_vv_nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 define <2 x i32> @vmacc_vv_nxv2i32_unmasked(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i32> @llvm.vp.mul.nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1017,8 +1177,11 @@ define <2 x i32> @vmacc_vv_nxv2i32_unmasked(<2 x i32> %a, <2 x i32> %b, <2 x i32 define <2 x i32> @vmacc_vx_nxv2i32(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 @@ -1032,8 +1195,11 @@ define <2 x i32> @vmacc_vx_nxv2i32(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> define <2 x i32> @vmacc_vx_nxv2i32_unmasked(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; 
CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 @@ -1047,9 +1213,10 @@ define <2 x i32> @vmacc_vx_nxv2i32_unmasked(<2 x i32> %a, i32 %b, <2 x i32> %c, define <2 x i32> @vmacc_vv_nxv2i32_ta(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i32> @llvm.vp.mul.nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i32> @llvm.vp.add.nxv2i32(<2 x i32> %x, <2 x i32> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1060,9 +1227,10 @@ define <2 x i32> @vmacc_vv_nxv2i32_ta(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, define <2 x i32> @vmacc_vx_nxv2i32_ta(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv2i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer @@ -1080,8 +1248,11 @@ declare <4 x i32> @llvm.vp.select.nxv4i32(<4 x i1>, <4 x i32>, <4 x i32>, i32) define <4 x i32> @vmacc_vv_nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i32> @llvm.vp.mul.nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1093,8 +1264,11 @@ define <4 x i32> @vmacc_vv_nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 define <4 x i32> @vmacc_vv_nxv4i32_unmasked(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i32> @llvm.vp.mul.nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1106,8 +1280,11 @@ define <4 x i32> @vmacc_vv_nxv4i32_unmasked(<4 x i32> %a, <4 x i32> %b, <4 x i32 define <4 x i32> @vmacc_vx_nxv4i32(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: 
vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 @@ -1121,8 +1298,11 @@ define <4 x i32> @vmacc_vx_nxv4i32(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> define <4 x i32> @vmacc_vx_nxv4i32_unmasked(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma -; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 @@ -1136,9 +1316,10 @@ define <4 x i32> @vmacc_vx_nxv4i32_unmasked(<4 x i32> %a, i32 %b, <4 x i32> %c, define <4 x i32> @vmacc_vv_nxv4i32_ta(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i32> @llvm.vp.mul.nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i32> @llvm.vp.add.nxv4i32(<4 x i32> %x, <4 x i32> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1149,9 +1330,10 @@ define <4 x i32> @vmacc_vv_nxv4i32_ta(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, define <4 x i32> @vmacc_vx_nxv4i32_ta(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv4i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu -; CHECK-NEXT: vmacc.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -1169,8 +1351,11 @@ declare <8 x i32> @llvm.vp.select.nxv8i32(<8 x i1>, <8 x i32>, <8 x i32>, i32) define <8 x i32> @vmacc_vv_nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <8 x i32> @llvm.vp.mul.nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1182,8 +1367,11 @@ define <8 x i32> @vmacc_vv_nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 define <8 x i32> @vmacc_vv_nxv8i32_unmasked(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v 
v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <8 x i32> @llvm.vp.mul.nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1195,8 +1383,11 @@ define <8 x i32> @vmacc_vv_nxv8i32_unmasked(<8 x i32> %a, <8 x i32> %b, <8 x i32 define <8 x i32> @vmacc_vx_nxv8i32(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu -; CHECK-NEXT: vmacc.vx v10, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 @@ -1210,8 +1401,11 @@ define <8 x i32> @vmacc_vx_nxv8i32(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> define <8 x i32> @vmacc_vx_nxv8i32_unmasked(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma -; CHECK-NEXT: vmacc.vx v10, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 @@ -1225,9 +1419,10 @@ define <8 x i32> @vmacc_vx_nxv8i32_unmasked(<8 x i32> %a, i32 %b, <8 x i32> %c, define <8 x i32> @vmacc_vv_nxv8i32_ta(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i32> @llvm.vp.mul.nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i32> @llvm.vp.add.nxv8i32(<8 x i32> %x, <8 x i32> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1238,9 +1433,10 @@ define <8 x i32> @vmacc_vv_nxv8i32_ta(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, define <8 x i32> @vmacc_vx_nxv8i32_ta(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv8i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu -; CHECK-NEXT: vmacc.vx v10, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer @@ -1258,8 +1454,11 @@ declare <16 x i32> @llvm.vp.select.nxv16i32(<16 x i1>, <16 x i32>, <16 x i32>, i define <16 x i32> @vmacc_vv_nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, 
v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <16 x i32> @llvm.vp.mul.nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -1271,8 +1470,11 @@ define <16 x i32> @vmacc_vv_nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c define <16 x i32> @vmacc_vv_nxv16i32_unmasked(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <16 x i32> @llvm.vp.mul.nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -1284,8 +1486,11 @@ define <16 x i32> @vmacc_vv_nxv16i32_unmasked(<16 x i32> %a, <16 x i32> %b, <16 define <16 x i32> @vmacc_vx_nxv16i32(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu -; CHECK-NEXT: vmacc.vx v12, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 @@ -1299,8 +1504,11 @@ define <16 x i32> @vmacc_vx_nxv16i32(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 define <16 x i32> @vmacc_vx_nxv16i32_unmasked(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, ma -; CHECK-NEXT: vmacc.vx v12, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 @@ -1314,9 +1522,10 @@ define <16 x i32> @vmacc_vx_nxv16i32_unmasked(<16 x i32> %a, i32 %b, <16 x i32> define <16 x i32> @vmacc_vv_nxv16i32_ta(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv16i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <16 x i32> @llvm.vp.mul.nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i1> splat (i1 -1), i32 %evl) %y = call <16 x i32> @llvm.vp.add.nxv16i32(<16 x i32> %x, <16 x i32> %c, <16 x i1> splat (i1 -1), i32 %evl) @@ -1327,9 +1536,10 @@ define <16 x i32> @vmacc_vv_nxv16i32_ta(<16 x i32> %a, <16 x i32> %b, <16 x i32> define <16 x i32> @vmacc_vx_nxv16i32_ta(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vx_nxv16i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; CHECK-NEXT: vmacc.vx v12, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; 
CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer @@ -1347,8 +1557,11 @@ declare <2 x i64> @llvm.vp.select.nxv2i64(<2 x i1>, <2 x i64>, <2 x i64>, i32) define <2 x i64> @vmacc_vv_nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i64> @llvm.vp.mul.nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1360,8 +1573,11 @@ define <2 x i64> @vmacc_vv_nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 define <2 x i64> @vmacc_vv_nxv2i64_unmasked(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vmacc.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i64> @llvm.vp.mul.nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1380,8 +1596,11 @@ define <2 x i64> @vmacc_vx_nxv2i64(<2 x i64> %a, i64 %b, <2 x i64> %c, <2 x i1> ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu -; RV32-NEXT: vmacc.vv v9, v8, v10, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV32-NEXT: vmerge.vvm v9, v9, v8, v0 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1389,8 +1608,11 @@ define <2 x i64> @vmacc_vx_nxv2i64(<2 x i64> %a, i64 %b, <2 x i64> %c, <2 x i1> ; ; RV64-LABEL: vmacc_vx_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu -; RV64-NEXT: vmacc.vx v9, a0, v8, v0.t +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV64-NEXT: vmerge.vvm v9, v9, v8, v0 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 @@ -1411,8 +1633,11 @@ define <2 x i64> @vmacc_vx_nxv2i64_unmasked(<2 x i64> %a, i64 %b, <2 x i64> %c, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, ma -; RV32-NEXT: vmacc.vv v9, v8, v10 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV32-NEXT: vmv.v.v v9, v8 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1420,8 +1645,11 @@ define <2 x i64> @vmacc_vx_nxv2i64_unmasked(<2 x i64> %a, i64 %b, <2 x i64> %c, ; ; RV64-LABEL: vmacc_vx_nxv2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: 
vsetvli zero, a1, e64, m1, tu, ma -; RV64-NEXT: vmacc.vx v9, a0, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV64-NEXT: vmv.v.v v9, v8 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 @@ -1435,9 +1663,10 @@ define <2 x i64> @vmacc_vx_nxv2i64_unmasked(<2 x i64> %a, i64 %b, <2 x i64> %c, define <2 x i64> @vmacc_vv_nxv2i64_ta(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv2i64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vmacc.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i64> @llvm.vp.mul.nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i64> @llvm.vp.add.nxv2i64(<2 x i64> %x, <2 x i64> %c, <2 x i1> splat (i1 -1), i32 %evl) @@ -1455,18 +1684,20 @@ define <2 x i64> @vmacc_vx_nxv2i64_ta(<2 x i64> %a, i64 %b, <2 x i64> %c, <2 x ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu -; RV32-NEXT: vmacc.vv v9, v8, v10, v0.t -; RV32-NEXT: vmv.v.v v8, v9 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vadd.vv v8, v8, v9 +; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vmacc_vx_nxv2i64_ta: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu -; RV64-NEXT: vmacc.vx v9, a0, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v9 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v9 +; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV64-NEXT: ret %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer @@ -1484,8 +1715,11 @@ declare <4 x i64> @llvm.vp.select.nxv4i64(<4 x i1>, <4 x i64>, <4 x i64>, i32) define <4 x i64> @vmacc_vv_nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <4 x i64> @llvm.vp.mul.nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1497,8 +1731,11 @@ define <4 x i64> @vmacc_vv_nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 define <4 x i64> @vmacc_vv_nxv4i64_unmasked(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vmacc.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <4 x i64> 
@llvm.vp.mul.nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1517,8 +1754,11 @@ define <4 x i64> @vmacc_vx_nxv4i64(<4 x i64> %a, i64 %b, <4 x i64> %c, <4 x i1> ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu -; RV32-NEXT: vmacc.vv v10, v8, v12, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV32-NEXT: vmerge.vvm v10, v10, v8, v0 ; RV32-NEXT: vmv2r.v v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1526,8 +1766,11 @@ define <4 x i64> @vmacc_vx_nxv4i64(<4 x i64> %a, i64 %b, <4 x i64> %c, <4 x i1> ; ; RV64-LABEL: vmacc_vx_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu -; RV64-NEXT: vmacc.vx v10, a0, v8, v0.t +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV64-NEXT: vmerge.vvm v10, v10, v8, v0 ; RV64-NEXT: vmv2r.v v8, v10 ; RV64-NEXT: ret %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 @@ -1548,8 +1791,11 @@ define <4 x i64> @vmacc_vx_nxv4i64_unmasked(<4 x i64> %a, i64 %b, <4 x i64> %c, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, ma -; RV32-NEXT: vmacc.vv v10, v8, v12 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV32-NEXT: vmv.v.v v10, v8 ; RV32-NEXT: vmv2r.v v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1557,8 +1803,11 @@ define <4 x i64> @vmacc_vx_nxv4i64_unmasked(<4 x i64> %a, i64 %b, <4 x i64> %c, ; ; RV64-LABEL: vmacc_vx_nxv4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, ma -; RV64-NEXT: vmacc.vx v10, a0, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV64-NEXT: vmv.v.v v10, v8 ; RV64-NEXT: vmv2r.v v8, v10 ; RV64-NEXT: ret %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 @@ -1572,9 +1821,10 @@ define <4 x i64> @vmacc_vx_nxv4i64_unmasked(<4 x i64> %a, i64 %b, <4 x i64> %c, define <4 x i64> @vmacc_vv_nxv4i64_ta(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv4i64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vmacc.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vadd.vv v8, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i64> @llvm.vp.mul.nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i64> @llvm.vp.add.nxv4i64(<4 x i64> %x, <4 x i64> %c, <4 x i1> splat (i1 -1), i32 %evl) @@ -1592,18 +1842,20 @@ define <4 x i64> @vmacc_vx_nxv4i64_ta(<4 x i64> %a, i64 %b, <4 x i64> %c, <4 x ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu -; RV32-NEXT: vmacc.vv v10, v8, v12, v0.t -; RV32-NEXT: vmv.v.v v8, v10 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: 
vmul.vv v8, v8, v12 +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vmacc_vx_nxv4i64_ta: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu -; RV64-NEXT: vmacc.vx v10, a0, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v10 +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v10 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: ret %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer @@ -1621,8 +1873,11 @@ declare <8 x i64> @llvm.vp.select.nxv8i64(<8 x i1>, <8 x i64>, <8 x i64>, i32) define <8 x i64> @vmacc_vv_nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <8 x i64> @llvm.vp.mul.nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1634,8 +1889,11 @@ define <8 x i64> @vmacc_vv_nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 define <8 x i64> @vmacc_vv_nxv8i64_unmasked(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vmacc.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <8 x i64> @llvm.vp.mul.nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1654,8 +1912,11 @@ define <8 x i64> @vmacc_vx_nxv8i64(<8 x i64> %a, i64 %b, <8 x i64> %c, <8 x i1> ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu -; RV32-NEXT: vmacc.vv v12, v8, v16, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmerge.vvm v12, v12, v8, v0 ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1663,8 +1924,11 @@ define <8 x i64> @vmacc_vx_nxv8i64(<8 x i64> %a, i64 %b, <8 x i64> %c, <8 x i1> ; ; RV64-LABEL: vmacc_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu -; RV64-NEXT: vmacc.vx v12, a0, v8, v0.t +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV64-NEXT: vmerge.vvm v12, v12, v8, v0 ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 @@ -1685,8 +1949,11 @@ define <8 x i64> @vmacc_vx_nxv8i64_unmasked(<8 x i64> %a, i64 %b, <8 x i64> %c, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, ma -; RV32-NEXT: vmacc.vv 
v12, v8, v16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1694,8 +1961,11 @@ define <8 x i64> @vmacc_vx_nxv8i64_unmasked(<8 x i64> %a, i64 %b, <8 x i64> %c, ; ; RV64-LABEL: vmacc_vx_nxv8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, ma -; RV64-NEXT: vmacc.vx v12, a0, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v12, v8 ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 @@ -1709,9 +1979,10 @@ define <8 x i64> @vmacc_vx_nxv8i64_unmasked(<8 x i64> %a, i64 %b, <8 x i64> %c, define <8 x i64> @vmacc_vv_nxv8i64_ta(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vmacc_vv_nxv8i64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vmacc.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vadd.vv v8, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i64> @llvm.vp.mul.nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i64> @llvm.vp.add.nxv8i64(<8 x i64> %x, <8 x i64> %c, <8 x i1> splat (i1 -1), i32 %evl) @@ -1729,18 +2000,20 @@ define <8 x i64> @vmacc_vx_nxv8i64_ta(<8 x i64> %a, i64 %b, <8 x i64> %c, <8 x ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu -; RV32-NEXT: vmacc.vv v12, v8, v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vadd.vv v8, v8, v12 +; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vmacc_vx_nxv8i64_ta: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu -; RV64-NEXT: vmacc.vx v12, a0, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vadd.vv v8, v8, v12 +; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV64-NEXT: ret %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll index 02aaea4177836..8dfd7f0e23130 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vmul-vp.ll @@ -1194,12 +1194,11 @@ define <8 x i64> @vmul_vadd_vx_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %ev define <8 x i64> @vmul_vadd_vx_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) { ; CHECK-LABEL: vmul_vadd_vx_v8i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 21 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vmv.v.x v12, a1 ; CHECK-NEXT: li a1, 7 ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmadd.vx v8, a1, v12 +; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: li a0, 21 +; CHECK-NEXT: vadd.vx v8, v8, a0 ; CHECK-NEXT: ret %head = 
insertelement <8 x i1> poison, i1 true, i32 0 %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll index b8798fe6c63dc..c1f7857a9abda 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnmsac-vp.ll @@ -12,8 +12,11 @@ declare <2 x i8> @llvm.vp.select.nxv2i8(<2 x i1>, <2 x i8>, <2 x i8>, i32) define <2 x i8> @vnmsac_vv_nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -25,8 +28,11 @@ define <2 x i8> @vnmsac_vv_nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i define <2 x i8> @vnmsac_vv_nxv2i8_unmasked(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -38,8 +44,11 @@ define <2 x i8> @vnmsac_vv_nxv2i8_unmasked(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c define <2 x i8> @vnmsac_vx_nxv2i8(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 @@ -53,8 +62,11 @@ define <2 x i8> @vnmsac_vx_nxv2i8(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, define <2 x i8> @vnmsac_vx_nxv2i8_unmasked(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 @@ -68,9 +80,10 @@ define <2 x i8> @vnmsac_vx_nxv2i8_unmasked(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 define <2 x i8> @vnmsac_vv_nxv2i8_ta(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, 
mf8, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i8> @llvm.vp.mul.nxv2i8(<2 x i8> %a, <2 x i8> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i8> @llvm.vp.sub.nxv2i8(<2 x i8> %c, <2 x i8> %x, <2 x i1> splat (i1 -1), i32 %evl) @@ -81,9 +94,10 @@ define <2 x i8> @vnmsac_vv_nxv2i8_ta(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 define <2 x i8> @vnmsac_vx_nxv2i8_ta(<2 x i8> %a, i8 %b, <2 x i8> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0 %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer @@ -101,8 +115,11 @@ declare <4 x i8> @llvm.vp.select.nxv4i8(<4 x i1>, <4 x i8>, <4 x i8>, i32) define <4 x i8> @vnmsac_vv_nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i8> @llvm.vp.mul.nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -114,8 +131,11 @@ define <4 x i8> @vnmsac_vv_nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i define <4 x i8> @vnmsac_vv_nxv4i8_unmasked(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i8> @llvm.vp.mul.nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -127,8 +147,11 @@ define <4 x i8> @vnmsac_vv_nxv4i8_unmasked(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c define <4 x i8> @vnmsac_vx_nxv4i8(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 @@ -142,8 +165,11 @@ define <4 x i8> @vnmsac_vx_nxv4i8(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, define <4 x i8> @vnmsac_vx_nxv4i8_unmasked(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: 
vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 @@ -157,9 +183,10 @@ define <4 x i8> @vnmsac_vx_nxv4i8_unmasked(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 define <4 x i8> @vnmsac_vv_nxv4i8_ta(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i8> @llvm.vp.mul.nxv4i8(<4 x i8> %a, <4 x i8> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i8> @llvm.vp.sub.nxv4i8(<4 x i8> %c, <4 x i8> %x, <4 x i1> splat (i1 -1), i32 %evl) @@ -170,9 +197,10 @@ define <4 x i8> @vnmsac_vv_nxv4i8_ta(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 define <4 x i8> @vnmsac_vx_nxv4i8_ta(<4 x i8> %a, i8 %b, <4 x i8> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0 %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -190,8 +218,11 @@ declare <8 x i8> @llvm.vp.select.nxv8i8(<8 x i1>, <8 x i8>, <8 x i8>, i32) define <8 x i8> @vnmsac_vv_nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.vp.mul.nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -203,8 +234,11 @@ define <8 x i8> @vnmsac_vv_nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i define <8 x i8> @vnmsac_vv_nxv8i8_unmasked(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.vp.mul.nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -216,8 +250,11 @@ define <8 x i8> @vnmsac_vv_nxv8i8_unmasked(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c define <8 x i8> @vnmsac_vx_nxv8i8(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli 
zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 @@ -231,8 +268,11 @@ define <8 x i8> @vnmsac_vx_nxv8i8(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, define <8 x i8> @vnmsac_vx_nxv8i8_unmasked(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 @@ -246,9 +286,10 @@ define <8 x i8> @vnmsac_vx_nxv8i8_unmasked(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 define <8 x i8> @vnmsac_vv_nxv8i8_ta(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i8> @llvm.vp.mul.nxv8i8(<8 x i8> %a, <8 x i8> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i8> @llvm.vp.sub.nxv8i8(<8 x i8> %c, <8 x i8> %x, <8 x i1> splat (i1 -1), i32 %evl) @@ -259,9 +300,10 @@ define <8 x i8> @vnmsac_vv_nxv8i8_ta(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 define <8 x i8> @vnmsac_vx_nxv8i8_ta(<8 x i8> %a, i8 %b, <8 x i8> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0 %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -279,8 +321,11 @@ declare <16 x i8> @llvm.vp.select.nxv16i8(<16 x i1>, <16 x i8>, <16 x i8>, i32) define <16 x i8> @vnmsac_vv_nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.vp.mul.nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -292,8 +337,11 @@ define <16 x i8> @vnmsac_vv_nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, < define <16 x i8> @vnmsac_vv_nxv16i8_unmasked(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: 
vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.vp.mul.nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -305,8 +353,11 @@ define <16 x i8> @vnmsac_vv_nxv16i8_unmasked(<16 x i8> %a, <16 x i8> %b, <16 x i define <16 x i8> @vnmsac_vx_nxv16i8(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 @@ -320,8 +371,11 @@ define <16 x i8> @vnmsac_vx_nxv16i8(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1 define <16 x i8> @vnmsac_vx_nxv16i8_unmasked(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 @@ -335,9 +389,10 @@ define <16 x i8> @vnmsac_vx_nxv16i8_unmasked(<16 x i8> %a, i8 %b, <16 x i8> %c, define <16 x i8> @vnmsac_vv_nxv16i8_ta(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <16 x i8> @llvm.vp.mul.nxv16i8(<16 x i8> %a, <16 x i8> %b, <16 x i1> splat (i1 -1), i32 %evl) %y = call <16 x i8> @llvm.vp.sub.nxv16i8(<16 x i8> %c, <16 x i8> %x, <16 x i1> splat (i1 -1), i32 %evl) @@ -348,9 +403,10 @@ define <16 x i8> @vnmsac_vv_nxv16i8_ta(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, define <16 x i8> @vnmsac_vx_nxv16i8_ta(<16 x i8> %a, i8 %b, <16 x i8> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0 %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer @@ -368,8 +424,11 @@ declare <32 x i8> @llvm.vp.select.nxv32i8(<32 x i1>, <32 x i8>, <32 x i8>, i32) define <32 x i8> @vnmsac_vv_nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: 
vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <32 x i8> @llvm.vp.mul.nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -381,8 +440,11 @@ define <32 x i8> @vnmsac_vv_nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, < define <32 x i8> @vnmsac_vv_nxv32i8_unmasked(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv32i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, ma -; CHECK-NEXT: vnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <32 x i8> @llvm.vp.mul.nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -394,8 +456,11 @@ define <32 x i8> @vnmsac_vv_nxv32i8_unmasked(<32 x i8> %a, <32 x i8> %b, <32 x i define <32 x i8> @vnmsac_vx_nxv32i8(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv32i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu -; CHECK-NEXT: vnmsac.vx v10, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i8> poison, i8 %b, i32 0 @@ -409,8 +474,11 @@ define <32 x i8> @vnmsac_vx_nxv32i8(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1 define <32 x i8> @vnmsac_vx_nxv32i8_unmasked(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv32i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, ma -; CHECK-NEXT: vnmsac.vx v10, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i8> poison, i8 %b, i32 0 @@ -424,9 +492,10 @@ define <32 x i8> @vnmsac_vx_nxv32i8_unmasked(<32 x i8> %a, i8 %b, <32 x i8> %c, define <32 x i8> @vnmsac_vv_nxv32i8_ta(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv32i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <32 x i8> @llvm.vp.mul.nxv32i8(<32 x i8> %a, <32 x i8> %b, <32 x i1> splat (i1 -1), i32 %evl) %y = call <32 x i8> @llvm.vp.sub.nxv32i8(<32 x i8> %c, <32 x i8> %x, <32 x i1> splat (i1 -1), i32 %evl) @@ -437,9 +506,10 @@ define <32 x i8> @vnmsac_vv_nxv32i8_ta(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, define <32 x i8> @vnmsac_vx_nxv32i8_ta(<32 x i8> %a, i8 %b, <32 x i8> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv32i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu -; CHECK-NEXT: vnmsac.vx v10, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = 
insertelement <32 x i8> poison, i8 %b, i32 0 %vb = shufflevector <32 x i8> %elt.head, <32 x i8> poison, <32 x i32> zeroinitializer @@ -457,8 +527,11 @@ declare <64 x i8> @llvm.vp.select.nxv64i8(<64 x i1>, <64 x i8>, <64 x i8>, i32) define <64 x i8> @vnmsac_vv_nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <64 x i8> @llvm.vp.mul.nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i1> splat (i1 -1), i32 %evl) @@ -470,8 +543,11 @@ define <64 x i8> @vnmsac_vv_nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, < define <64 x i8> @vnmsac_vv_nxv64i8_unmasked(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv64i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma -; CHECK-NEXT: vnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <64 x i8> @llvm.vp.mul.nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i1> splat (i1 -1), i32 %evl) @@ -483,8 +559,11 @@ define <64 x i8> @vnmsac_vv_nxv64i8_unmasked(<64 x i8> %a, <64 x i8> %b, <64 x i define <64 x i8> @vnmsac_vx_nxv64i8(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv64i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu -; CHECK-NEXT: vnmsac.vx v12, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i8> poison, i8 %b, i32 0 @@ -498,8 +577,11 @@ define <64 x i8> @vnmsac_vx_nxv64i8(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1 define <64 x i8> @vnmsac_vx_nxv64i8_unmasked(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv64i8_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, ma -; CHECK-NEXT: vnmsac.vx v12, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i8> poison, i8 %b, i32 0 @@ -513,9 +595,10 @@ define <64 x i8> @vnmsac_vx_nxv64i8_unmasked(<64 x i8> %a, i8 %b, <64 x i8> %c, define <64 x i8> @vnmsac_vv_nxv64i8_ta(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv64i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <64 x i8> @llvm.vp.mul.nxv64i8(<64 x i8> %a, <64 x i8> %b, <64 x i1> 
splat (i1 -1), i32 %evl) %y = call <64 x i8> @llvm.vp.sub.nxv64i8(<64 x i8> %c, <64 x i8> %x, <64 x i1> splat (i1 -1), i32 %evl) @@ -526,9 +609,10 @@ define <64 x i8> @vnmsac_vv_nxv64i8_ta(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, define <64 x i8> @vnmsac_vx_nxv64i8_ta(<64 x i8> %a, i8 %b, <64 x i8> %c, <64 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv64i8_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu -; CHECK-NEXT: vnmsac.vx v12, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <64 x i8> poison, i8 %b, i32 0 %vb = shufflevector <64 x i8> %elt.head, <64 x i8> poison, <64 x i32> zeroinitializer @@ -546,8 +630,11 @@ declare <2 x i16> @llvm.vp.select.nxv2i16(<2 x i1>, <2 x i16>, <2 x i16>, i32) define <2 x i16> @vnmsac_vv_nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i16> @llvm.vp.mul.nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -559,8 +646,11 @@ define <2 x i16> @vnmsac_vv_nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, < define <2 x i16> @vnmsac_vv_nxv2i16_unmasked(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i16> @llvm.vp.mul.nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -572,8 +662,11 @@ define <2 x i16> @vnmsac_vv_nxv2i16_unmasked(<2 x i16> %a, <2 x i16> %b, <2 x i1 define <2 x i16> @vnmsac_vx_nxv2i16(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 @@ -587,8 +680,11 @@ define <2 x i16> @vnmsac_vx_nxv2i16(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1 define <2 x i16> @vnmsac_vx_nxv2i16_unmasked(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret 
%elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 @@ -602,9 +698,10 @@ define <2 x i16> @vnmsac_vx_nxv2i16_unmasked(<2 x i16> %a, i16 %b, <2 x i16> %c, define <2 x i16> @vnmsac_vv_nxv2i16_ta(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i16> @llvm.vp.mul.nxv2i16(<2 x i16> %a, <2 x i16> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i16> @llvm.vp.sub.nxv2i16(<2 x i16> %c, <2 x i16> %x, <2 x i1> splat (i1 -1), i32 %evl) @@ -615,9 +712,10 @@ define <2 x i16> @vnmsac_vv_nxv2i16_ta(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, define <2 x i16> @vnmsac_vx_nxv2i16_ta(<2 x i16> %a, i16 %b, <2 x i16> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0 %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer @@ -635,8 +733,11 @@ declare <4 x i16> @llvm.vp.select.nxv4i16(<4 x i1>, <4 x i16>, <4 x i16>, i32) define <4 x i16> @vnmsac_vv_nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i16> @llvm.vp.mul.nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -648,8 +749,11 @@ define <4 x i16> @vnmsac_vv_nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, < define <4 x i16> @vnmsac_vv_nxv4i16_unmasked(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i16> @llvm.vp.mul.nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -661,8 +765,11 @@ define <4 x i16> @vnmsac_vv_nxv4i16_unmasked(<4 x i16> %a, <4 x i16> %b, <4 x i1 define <4 x i16> @vnmsac_vx_nxv4i16(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; 
CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 @@ -676,8 +783,11 @@ define <4 x i16> @vnmsac_vx_nxv4i16(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1 define <4 x i16> @vnmsac_vx_nxv4i16_unmasked(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 @@ -691,9 +801,10 @@ define <4 x i16> @vnmsac_vx_nxv4i16_unmasked(<4 x i16> %a, i16 %b, <4 x i16> %c, define <4 x i16> @vnmsac_vv_nxv4i16_ta(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i16> @llvm.vp.mul.nxv4i16(<4 x i16> %a, <4 x i16> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i16> @llvm.vp.sub.nxv4i16(<4 x i16> %c, <4 x i16> %x, <4 x i1> splat (i1 -1), i32 %evl) @@ -704,9 +815,10 @@ define <4 x i16> @vnmsac_vv_nxv4i16_ta(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, define <4 x i16> @vnmsac_vx_nxv4i16_ta(<4 x i16> %a, i16 %b, <4 x i16> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0 %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -724,8 +836,11 @@ declare <8 x i16> @llvm.vp.select.nxv8i16(<8 x i1>, <8 x i16>, <8 x i16>, i32) define <8 x i16> @vnmsac_vv_nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.vp.mul.nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -737,8 +852,11 @@ define <8 x i16> @vnmsac_vv_nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, < define <8 x i16> @vnmsac_vv_nxv8i16_unmasked(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; 
CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.vp.mul.nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -750,8 +868,11 @@ define <8 x i16> @vnmsac_vv_nxv8i16_unmasked(<8 x i16> %a, <8 x i16> %b, <8 x i1 define <8 x i16> @vnmsac_vx_nxv8i16(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 @@ -765,8 +886,11 @@ define <8 x i16> @vnmsac_vx_nxv8i16(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1 define <8 x i16> @vnmsac_vx_nxv8i16_unmasked(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 @@ -780,9 +904,10 @@ define <8 x i16> @vnmsac_vx_nxv8i16_unmasked(<8 x i16> %a, i16 %b, <8 x i16> %c, define <8 x i16> @vnmsac_vv_nxv8i16_ta(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i16> @llvm.vp.mul.nxv8i16(<8 x i16> %a, <8 x i16> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i16> @llvm.vp.sub.nxv8i16(<8 x i16> %c, <8 x i16> %x, <8 x i1> splat (i1 -1), i32 %evl) @@ -793,9 +918,10 @@ define <8 x i16> @vnmsac_vv_nxv8i16_ta(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, define <8 x i16> @vnmsac_vx_nxv8i16_ta(<8 x i16> %a, i16 %b, <8 x i16> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0 %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer @@ -813,8 +939,11 @@ declare <16 x i16> @llvm.vp.select.nxv16i16(<16 x i1>, <16 x i16>, <16 x i16>, i define <16 x i16> @vnmsac_vv_nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: 
vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.vp.mul.nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -826,8 +955,11 @@ define <16 x i16> @vnmsac_vv_nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> % define <16 x i16> @vnmsac_vv_nxv16i16_unmasked(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, tu, ma -; CHECK-NEXT: vnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.vp.mul.nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -839,8 +971,11 @@ define <16 x i16> @vnmsac_vv_nxv16i16_unmasked(<16 x i16> %a, <16 x i16> %b, <16 define <16 x i16> @vnmsac_vx_nxv16i16(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, mu -; CHECK-NEXT: vnmsac.vx v10, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 @@ -854,8 +989,11 @@ define <16 x i16> @vnmsac_vx_nxv16i16(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 define <16 x i16> @vnmsac_vx_nxv16i16_unmasked(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, tu, ma -; CHECK-NEXT: vnmsac.vx v10, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 @@ -869,9 +1007,10 @@ define <16 x i16> @vnmsac_vx_nxv16i16_unmasked(<16 x i16> %a, i16 %b, <16 x i16> define <16 x i16> @vnmsac_vv_nxv16i16_ta(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <16 x i16> @llvm.vp.mul.nxv16i16(<16 x i16> %a, <16 x i16> %b, <16 x i1> splat (i1 -1), i32 %evl) %y = call <16 x i16> @llvm.vp.sub.nxv16i16(<16 x i16> %c, <16 x i16> %x, <16 x i1> splat (i1 -1), i32 %evl) @@ -882,9 +1021,10 @@ define <16 x i16> @vnmsac_vv_nxv16i16_ta(<16 x i16> %a, <16 x i16> %b, <16 x i16 define <16 x i16> @vnmsac_vx_nxv16i16_ta(<16 x i16> %a, i16 %b, <16 x i16> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu -; CHECK-NEXT: vnmsac.vx v10, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: 
vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0 %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer @@ -902,8 +1042,11 @@ declare <32 x i16> @llvm.vp.select.nxv32i16(<32 x i1>, <32 x i16>, <32 x i16>, i define <32 x i16> @vnmsac_vv_nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <32 x i16> @llvm.vp.mul.nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -915,8 +1058,11 @@ define <32 x i16> @vnmsac_vv_nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i16> % define <32 x i16> @vnmsac_vv_nxv32i16_unmasked(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv32i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, tu, ma -; CHECK-NEXT: vnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <32 x i16> @llvm.vp.mul.nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i1> splat (i1 -1), i32 %evl) @@ -928,8 +1074,11 @@ define <32 x i16> @vnmsac_vv_nxv32i16_unmasked(<32 x i16> %a, <32 x i16> %b, <32 define <32 x i16> @vnmsac_vx_nxv32i16(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv32i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu -; CHECK-NEXT: vnmsac.vx v12, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i16> poison, i16 %b, i32 0 @@ -943,8 +1092,11 @@ define <32 x i16> @vnmsac_vx_nxv32i16(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 define <32 x i16> @vnmsac_vx_nxv32i16_unmasked(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv32i16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, ma -; CHECK-NEXT: vnmsac.vx v12, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i16> poison, i16 %b, i32 0 @@ -958,9 +1110,10 @@ define <32 x i16> @vnmsac_vx_nxv32i16_unmasked(<32 x i16> %a, i16 %b, <32 x i16> define <32 x i16> @vnmsac_vv_nxv32i16_ta(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv32i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: 
vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <32 x i16> @llvm.vp.mul.nxv32i16(<32 x i16> %a, <32 x i16> %b, <32 x i1> splat (i1 -1), i32 %evl) %y = call <32 x i16> @llvm.vp.sub.nxv32i16(<32 x i16> %c, <32 x i16> %x, <32 x i1> splat (i1 -1), i32 %evl) @@ -971,9 +1124,10 @@ define <32 x i16> @vnmsac_vv_nxv32i16_ta(<32 x i16> %a, <32 x i16> %b, <32 x i16 define <32 x i16> @vnmsac_vx_nxv32i16_ta(<32 x i16> %a, i16 %b, <32 x i16> %c, <32 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv32i16_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu -; CHECK-NEXT: vnmsac.vx v12, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <32 x i16> poison, i16 %b, i32 0 %vb = shufflevector <32 x i16> %elt.head, <32 x i16> poison, <32 x i32> zeroinitializer @@ -991,8 +1145,11 @@ declare <2 x i32> @llvm.vp.select.nxv2i32(<2 x i1>, <2 x i32>, <2 x i32>, i32) define <2 x i32> @vnmsac_vv_nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i32> @llvm.vp.mul.nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1004,8 +1161,11 @@ define <2 x i32> @vnmsac_vv_nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, < define <2 x i32> @vnmsac_vv_nxv2i32_unmasked(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i32> @llvm.vp.mul.nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1017,8 +1177,11 @@ define <2 x i32> @vnmsac_vv_nxv2i32_unmasked(<2 x i32> %a, <2 x i32> %b, <2 x i3 define <2 x i32> @vnmsac_vx_nxv2i32(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 @@ -1032,8 +1195,11 @@ define <2 x i32> @vnmsac_vx_nxv2i32(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1 define <2 x i32> @vnmsac_vx_nxv2i32_unmasked(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; 
CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 @@ -1047,9 +1213,10 @@ define <2 x i32> @vnmsac_vx_nxv2i32_unmasked(<2 x i32> %a, i32 %b, <2 x i32> %c, define <2 x i32> @vnmsac_vv_nxv2i32_ta(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i32> @llvm.vp.mul.nxv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i32> @llvm.vp.sub.nxv2i32(<2 x i32> %c, <2 x i32> %x, <2 x i1> splat (i1 -1), i32 %evl) @@ -1060,9 +1227,10 @@ define <2 x i32> @vnmsac_vv_nxv2i32_ta(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, define <2 x i32> @vnmsac_vx_nxv2i32_ta(<2 x i32> %a, i32 %b, <2 x i32> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv2i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0 %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer @@ -1080,8 +1248,11 @@ declare <4 x i32> @llvm.vp.select.nxv4i32(<4 x i1>, <4 x i32>, <4 x i32>, i32) define <4 x i32> @vnmsac_vv_nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i32> @llvm.vp.mul.nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1093,8 +1264,11 @@ define <4 x i32> @vnmsac_vv_nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, < define <4 x i32> @vnmsac_vv_nxv4i32_unmasked(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <4 x i32> @llvm.vp.mul.nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1106,8 +1280,11 @@ define <4 x i32> @vnmsac_vv_nxv4i32_unmasked(<4 x i32> %a, <4 x i32> %b, <4 x i3 define <4 x i32> @vnmsac_vx_nxv4i32(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; 
CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v9, v9, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 @@ -1121,8 +1298,11 @@ define <4 x i32> @vnmsac_vx_nxv4i32(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1 define <4 x i32> @vnmsac_vx_nxv4i32_unmasked(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma -; CHECK-NEXT: vnmsac.vx v9, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v9, v8 ; CHECK-NEXT: vmv1r.v v8, v9 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 @@ -1136,9 +1316,10 @@ define <4 x i32> @vnmsac_vx_nxv4i32_unmasked(<4 x i32> %a, i32 %b, <4 x i32> %c, define <4 x i32> @vnmsac_vv_nxv4i32_ta(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i32> @llvm.vp.mul.nxv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i32> @llvm.vp.sub.nxv4i32(<4 x i32> %c, <4 x i32> %x, <4 x i1> splat (i1 -1), i32 %evl) @@ -1149,9 +1330,10 @@ define <4 x i32> @vnmsac_vv_nxv4i32_ta(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, define <4 x i32> @vnmsac_vx_nxv4i32_ta(<4 x i32> %a, i32 %b, <4 x i32> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv4i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu -; CHECK-NEXT: vnmsac.vx v9, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v9, v8 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0 %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -1169,8 +1351,11 @@ declare <8 x i32> @llvm.vp.select.nxv8i32(<8 x i1>, <8 x i32>, <8 x i32>, i32) define <8 x i32> @vnmsac_vv_nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <8 x i32> @llvm.vp.mul.nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1182,8 +1367,11 @@ define <8 x i32> @vnmsac_vv_nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, < define <8 x i32> @vnmsac_vv_nxv8i32_unmasked(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, tu, ma -; CHECK-NEXT: vnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, 
ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <8 x i32> @llvm.vp.mul.nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1195,8 +1383,11 @@ define <8 x i32> @vnmsac_vv_nxv8i32_unmasked(<8 x i32> %a, <8 x i32> %b, <8 x i3 define <8 x i32> @vnmsac_vx_nxv8i32(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu -; CHECK-NEXT: vnmsac.vx v10, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 @@ -1210,8 +1401,11 @@ define <8 x i32> @vnmsac_vx_nxv8i32(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1 define <8 x i32> @vnmsac_vx_nxv8i32_unmasked(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma -; CHECK-NEXT: vnmsac.vx v10, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 @@ -1225,9 +1419,10 @@ define <8 x i32> @vnmsac_vx_nxv8i32_unmasked(<8 x i32> %a, i32 %b, <8 x i32> %c, define <8 x i32> @vnmsac_vv_nxv8i32_ta(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i32> @llvm.vp.mul.nxv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i32> @llvm.vp.sub.nxv8i32(<8 x i32> %c, <8 x i32> %x, <8 x i1> splat (i1 -1), i32 %evl) @@ -1238,9 +1433,10 @@ define <8 x i32> @vnmsac_vv_nxv8i32_ta(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, define <8 x i32> @vnmsac_vx_nxv8i32_ta(<8 x i32> %a, i32 %b, <8 x i32> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv8i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu -; CHECK-NEXT: vnmsac.vx v10, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0 %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer @@ -1258,8 +1454,11 @@ declare <16 x i32> @llvm.vp.select.nxv16i32(<16 x i1>, <16 x i32>, <16 x i32>, i define <16 x i32> @vnmsac_vv_nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, 
ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <16 x i32> @llvm.vp.mul.nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -1271,8 +1470,11 @@ define <16 x i32> @vnmsac_vv_nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i32> % define <16 x i32> @vnmsac_vv_nxv16i32_unmasked(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, tu, ma -; CHECK-NEXT: vnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <16 x i32> @llvm.vp.mul.nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i1> splat (i1 -1), i32 %evl) @@ -1284,8 +1486,11 @@ define <16 x i32> @vnmsac_vv_nxv16i32_unmasked(<16 x i32> %a, <16 x i32> %b, <16 define <16 x i32> @vnmsac_vx_nxv16i32(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu -; CHECK-NEXT: vnmsac.vx v12, a0, v8, v0.t +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 @@ -1299,8 +1504,11 @@ define <16 x i32> @vnmsac_vx_nxv16i32(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 define <16 x i32> @vnmsac_vx_nxv16i32_unmasked(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i32_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, ma -; CHECK-NEXT: vnmsac.vx v12, a0, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 @@ -1314,9 +1522,10 @@ define <16 x i32> @vnmsac_vx_nxv16i32_unmasked(<16 x i32> %a, i32 %b, <16 x i32> define <16 x i32> @vnmsac_vv_nxv16i32_ta(<16 x i32> %a, <16 x i32> %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv16i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <16 x i32> @llvm.vp.mul.nxv16i32(<16 x i32> %a, <16 x i32> %b, <16 x i1> splat (i1 -1), i32 %evl) %y = call <16 x i32> @llvm.vp.sub.nxv16i32(<16 x i32> %c, <16 x i32> %x, <16 x i1> splat (i1 -1), i32 %evl) @@ -1327,9 +1536,10 @@ define <16 x i32> @vnmsac_vv_nxv16i32_ta(<16 x i32> %a, <16 x i32> %b, <16 x i32 define <16 x i32> @vnmsac_vx_nxv16i32_ta(<16 x i32> %a, i32 %b, <16 x i32> %c, <16 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vx_nxv16i32_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu -; CHECK-NEXT: 
vnmsac.vx v12, a0, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v8, a0 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0 %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer @@ -1347,8 +1557,11 @@ declare <2 x i64> @llvm.vp.select.nxv2i64(<2 x i1>, <2 x i64>, <2 x i64>, i32) define <2 x i64> @vnmsac_vv_nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v10, v10, v8, v0 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i64> @llvm.vp.mul.nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1360,8 +1573,11 @@ define <2 x i64> @vnmsac_vv_nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, < define <2 x i64> @vnmsac_vv_nxv2i64_unmasked(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, tu, ma -; CHECK-NEXT: vnmsac.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v10, v8 ; CHECK-NEXT: vmv1r.v v8, v10 ; CHECK-NEXT: ret %x = call <2 x i64> @llvm.vp.mul.nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i1> splat (i1 -1), i32 %evl) @@ -1380,8 +1596,11 @@ define <2 x i64> @vnmsac_vx_nxv2i64(<2 x i64> %a, i64 %b, <2 x i64> %c, <2 x i1 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, mu -; RV32-NEXT: vnmsac.vv v9, v8, v10, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v9, v8 +; RV32-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV32-NEXT: vmerge.vvm v9, v9, v8, v0 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1389,8 +1608,11 @@ define <2 x i64> @vnmsac_vx_nxv2i64(<2 x i64> %a, i64 %b, <2 x i64> %c, <2 x i1 ; ; RV64-LABEL: vnmsac_vx_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu -; RV64-NEXT: vnmsac.vx v9, a0, v8, v0.t +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v9, v8 +; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV64-NEXT: vmerge.vvm v9, v9, v8, v0 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 @@ -1411,8 +1633,11 @@ define <2 x i64> @vnmsac_vx_nxv2i64_unmasked(<2 x i64> %a, i64 %b, <2 x i64> %c, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, tu, ma -; RV32-NEXT: vnmsac.vv v9, v8, v10 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v9, v8 +; RV32-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV32-NEXT: vmv.v.v v9, v8 ; RV32-NEXT: vmv1r.v v8, v9 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: 
.cfi_def_cfa_offset 0 @@ -1420,8 +1645,11 @@ define <2 x i64> @vnmsac_vx_nxv2i64_unmasked(<2 x i64> %a, i64 %b, <2 x i64> %c, ; ; RV64-LABEL: vnmsac_vx_nxv2i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, ma -; RV64-NEXT: vnmsac.vx v9, a0, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v9, v8 +; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV64-NEXT: vmv.v.v v9, v8 ; RV64-NEXT: vmv1r.v v8, v9 ; RV64-NEXT: ret %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 @@ -1435,9 +1663,10 @@ define <2 x i64> @vnmsac_vx_nxv2i64_unmasked(<2 x i64> %a, i64 %b, <2 x i64> %c, define <2 x i64> @vnmsac_vv_nxv2i64_ta(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv2i64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu -; CHECK-NEXT: vnmsac.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: ret %x = call <2 x i64> @llvm.vp.mul.nxv2i64(<2 x i64> %a, <2 x i64> %b, <2 x i1> splat (i1 -1), i32 %evl) %y = call <2 x i64> @llvm.vp.sub.nxv2i64(<2 x i64> %c, <2 x i64> %x, <2 x i1> splat (i1 -1), i32 %evl) @@ -1455,18 +1684,20 @@ define <2 x i64> @vnmsac_vx_nxv2i64_ta(<2 x i64> %a, i64 %b, <2 x i64> %c, <2 x ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu -; RV32-NEXT: vnmsac.vv v9, v8, v10, v0.t -; RV32-NEXT: vmv.v.v v8, v9 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v10 +; RV32-NEXT: vsub.vv v8, v9, v8 +; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vnmsac_vx_nxv2i64_ta: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, mu -; RV64-NEXT: vnmsac.vx v9, a0, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v9 +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v9, v8 +; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV64-NEXT: ret %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0 %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer @@ -1484,8 +1715,11 @@ declare <4 x i64> @llvm.vp.select.nxv4i64(<4 x i1>, <4 x i64>, <4 x i64>, i32) define <4 x i64> @vnmsac_vv_nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v8, v0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <4 x i64> @llvm.vp.mul.nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1497,8 +1731,11 @@ define <4 x i64> @vnmsac_vv_nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, < define <4 x i64> @vnmsac_vv_nxv4i64_unmasked(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, tu, ma -; CHECK-NEXT: vnmsac.vv v12, v8, v10 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; 
CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v12, v8 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %x = call <4 x i64> @llvm.vp.mul.nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i1> splat (i1 -1), i32 %evl) @@ -1517,8 +1754,11 @@ define <4 x i64> @vnmsac_vx_nxv4i64(<4 x i64> %a, i64 %b, <4 x i64> %c, <4 x i1 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, mu -; RV32-NEXT: vnmsac.vv v10, v8, v12, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV32-NEXT: vmerge.vvm v10, v10, v8, v0 ; RV32-NEXT: vmv2r.v v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1526,8 +1766,11 @@ define <4 x i64> @vnmsac_vx_nxv4i64(<4 x i64> %a, i64 %b, <4 x i64> %c, <4 x i1 ; ; RV64-LABEL: vnmsac_vx_nxv4i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu -; RV64-NEXT: vnmsac.vx v10, a0, v8, v0.t +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV64-NEXT: vmerge.vvm v10, v10, v8, v0 ; RV64-NEXT: vmv2r.v v8, v10 ; RV64-NEXT: ret %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 @@ -1548,8 +1791,11 @@ define <4 x i64> @vnmsac_vx_nxv4i64_unmasked(<4 x i64> %a, i64 %b, <4 x i64> %c, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m2, tu, ma -; RV32-NEXT: vnmsac.vv v10, v8, v12 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV32-NEXT: vmv.v.v v10, v8 ; RV32-NEXT: vmv2r.v v8, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1557,8 +1803,11 @@ define <4 x i64> @vnmsac_vx_nxv4i64_unmasked(<4 x i64> %a, i64 %b, <4 x i64> %c, ; ; RV64-LABEL: vnmsac_vx_nxv4i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, ma -; RV64-NEXT: vnmsac.vx v10, a0, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV64-NEXT: vmv.v.v v10, v8 ; RV64-NEXT: vmv2r.v v8, v10 ; RV64-NEXT: ret %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 @@ -1572,9 +1821,10 @@ define <4 x i64> @vnmsac_vx_nxv4i64_unmasked(<4 x i64> %a, i64 %b, <4 x i64> %c, define <4 x i64> @vnmsac_vv_nxv4i64_ta(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv4i64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, mu -; CHECK-NEXT: vnmsac.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vsub.vv v8, v12, v8 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: ret %x = call <4 x i64> @llvm.vp.mul.nxv4i64(<4 x i64> %a, <4 x i64> %b, <4 x i1> splat (i1 -1), i32 %evl) %y = call <4 x i64> @llvm.vp.sub.nxv4i64(<4 x i64> %c, <4 x i64> %x, <4 x i1> splat (i1 -1), i32 %evl) @@ -1592,18 +1842,20 @@ define <4 x i64> @vnmsac_vx_nxv4i64_ta(<4 x i64> %a, i64 %b, <4 x i64> %c, <4 x ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 4, e64, 
m2, ta, ma ; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu -; RV32-NEXT: vnmsac.vv v10, v8, v12, v0.t -; RV32-NEXT: vmv.v.v v8, v10 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v12 +; RV32-NEXT: vsub.vv v8, v10, v8 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vnmsac_vx_nxv4i64_ta: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, mu -; RV64-NEXT: vnmsac.vx v10, a0, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v10 +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v10, v8 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: ret %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0 %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer @@ -1621,8 +1873,11 @@ declare <8 x i64> @llvm.vp.select.nxv8i64(<8 x i1>, <8 x i64>, <8 x i64>, i32) define <8 x i64> @vnmsac_vv_nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <8 x i64> @llvm.vp.mul.nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1634,8 +1889,11 @@ define <8 x i64> @vnmsac_vv_nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, < define <8 x i64> @vnmsac_vv_nxv8i64_unmasked(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i64_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, tu, ma -; CHECK-NEXT: vnmsac.vv v16, v8, v12 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v16, v8 ; CHECK-NEXT: vmv4r.v v8, v16 ; CHECK-NEXT: ret %x = call <8 x i64> @llvm.vp.mul.nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i1> splat (i1 -1), i32 %evl) @@ -1654,8 +1912,11 @@ define <8 x i64> @vnmsac_vx_nxv8i64(<8 x i64> %a, i64 %b, <8 x i64> %c, <8 x i1 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, mu -; RV32-NEXT: vnmsac.vv v12, v8, v16, v0.t +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsub.vv v8, v12, v8 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmerge.vvm v12, v12, v8, v0 ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1663,8 +1924,11 @@ define <8 x i64> @vnmsac_vx_nxv8i64(<8 x i64> %a, i64 %b, <8 x i64> %c, <8 x i1 ; ; RV64-LABEL: vnmsac_vx_nxv8i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, mu -; RV64-NEXT: vnmsac.vx v12, a0, v8, v0.t +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v12, v8 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV64-NEXT: vmerge.vvm v12, v12, v8, v0 ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 @@ -1685,8 +1949,11 @@ define <8 x i64> 
@vnmsac_vx_nxv8i64_unmasked(<8 x i64> %a, i64 %b, <8 x i64> %c, ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, tu, ma -; RV32-NEXT: vnmsac.vv v12, v8, v16 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsub.vv v8, v12, v8 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: vmv4r.v v8, v12 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 @@ -1694,8 +1961,11 @@ define <8 x i64> @vnmsac_vx_nxv8i64_unmasked(<8 x i64> %a, i64 %b, <8 x i64> %c, ; ; RV64-LABEL: vnmsac_vx_nxv8i64_unmasked: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, ma -; RV64-NEXT: vnmsac.vx v12, a0, v8 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v12, v8 +; RV64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV64-NEXT: vmv.v.v v12, v8 ; RV64-NEXT: vmv4r.v v8, v12 ; RV64-NEXT: ret %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 @@ -1709,9 +1979,10 @@ define <8 x i64> @vnmsac_vx_nxv8i64_unmasked(<8 x i64> %a, i64 %b, <8 x i64> %c, define <8 x i64> @vnmsac_vv_nxv8i64_ta(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vnmsac_vv_nxv8i64_ta: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, mu -; CHECK-NEXT: vnmsac.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmul.vv v8, v8, v12 +; CHECK-NEXT: vsub.vv v8, v16, v8 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: ret %x = call <8 x i64> @llvm.vp.mul.nxv8i64(<8 x i64> %a, <8 x i64> %b, <8 x i1> splat (i1 -1), i32 %evl) %y = call <8 x i64> @llvm.vp.sub.nxv8i64(<8 x i64> %c, <8 x i64> %x, <8 x i1> splat (i1 -1), i32 %evl) @@ -1729,18 +2000,20 @@ define <8 x i64> @vnmsac_vx_nxv8i64_ta(<8 x i64> %a, i64 %b, <8 x i64> %c, <8 x ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vlse64.v v16, (a0), zero -; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu -; RV32-NEXT: vnmsac.vv v12, v8, v16, v0.t -; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vsub.vv v8, v12, v8 +; RV32-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vnmsac_vx_nxv8i64_ta: ; RV64: # %bb.0: -; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, mu -; RV64-NEXT: vnmsac.vx v12, a0, v8, v0.t -; RV64-NEXT: vmv.v.v v8, v12 +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmul.vx v8, v8, a0 +; RV64-NEXT: vsub.vv v8, v12, v8 +; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 ; RV64-NEXT: ret %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0 %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnsra-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnsra-vnsrl.ll index 678d2524c75d7..8f8ab0be53d8b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnsra-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vnsra-vnsrl.ll @@ -5,8 +5,10 @@ define <8 x i8> @vnsra_v8i16_v8i8_scalar(<8 x i16> %x, i16 %y) { ; CHECK-LABEL: vnsra_v8i16_v8i8_scalar: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsra.vx v8, v8, a0 +; 
CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <8 x i16> poison, i16 %y, i16 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer @@ -19,7 +21,12 @@ define <8 x i8> @vnsra_v8i16_v8i8_scalar_sext(<8 x i16> %x, i8 %y) { ; CHECK-LABEL: vnsra_v8i16_v8i8_scalar_sext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <8 x i8> poison, i8 %y, i8 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer @@ -33,7 +40,12 @@ define <8 x i8> @vnsra_v8i16_v8i8_scalar_zext(<8 x i16> %x, i8 %y) { ; CHECK-LABEL: vnsra_v8i16_v8i8_scalar_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <8 x i8> poison, i8 %y, i8 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer @@ -46,8 +58,10 @@ define <8 x i8> @vnsra_v8i16_v8i8_scalar_zext(<8 x i16> %x, i8 %y) { define <4 x i16> @vnsra_v4i32_v4i16_scalar(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: vnsra_v4i32_v4i16_scalar: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsra.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <4 x i32> poison, i32 %y, i32 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer @@ -60,7 +74,12 @@ define <4 x i16> @vnsra_v4i32_v4i16_scalar_sext(<4 x i32> %x, i16 %y) { ; CHECK-LABEL: vnsra_v4i32_v4i16_scalar_sext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <4 x i16> poison, i16 %y, i16 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer @@ -74,7 +93,12 @@ define <4 x i16> @vnsra_v4i32_v4i16_scalar_zext(<4 x i32> %x, i16 %y) { ; CHECK-LABEL: vnsra_v4i32_v4i16_scalar_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <4 x i16> poison, i16 %y, i16 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer @@ -87,8 +111,10 @@ define <4 x i16> @vnsra_v4i32_v4i16_scalar_zext(<4 x i32> %x, i16 %y) { define <2 x i32> @vnsra_v2i64_v2i32_scalar(<2 x i64> %x, i64 %y) { ; 
CHECK-LABEL: vnsra_v2i64_v2i32_scalar: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsra.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <2 x i64> poison, i64 %y, i32 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer @@ -101,7 +127,12 @@ define <2 x i32> @vnsra_v2i64_v2i32_scalar_sext(<2 x i64> %x, i32 %y) { ; CHECK-LABEL: vnsra_v2i64_v2i32_scalar_sext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <2 x i32> poison, i32 %y, i32 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer @@ -115,7 +146,12 @@ define <2 x i32> @vnsra_v2i64_v2i32_scalar_zext(<2 x i64> %x, i32 %y) { ; CHECK-LABEL: vnsra_v2i64_v2i32_scalar_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsra.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <2 x i32> poison, i32 %y, i32 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer @@ -128,8 +164,10 @@ define <2 x i32> @vnsra_v2i64_v2i32_scalar_zext(<2 x i64> %x, i32 %y) { define <8 x i8> @vnsra_v8i16_v8i8_imm(<8 x i16> %x) { ; CHECK-LABEL: vnsra_v8i16_v8i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = ashr <8 x i16> %x, %b = trunc <8 x i16> %a to <8 x i8> @@ -139,8 +177,10 @@ define <8 x i8> @vnsra_v8i16_v8i8_imm(<8 x i16> %x) { define <4 x i16> @vnsra_v4i32_v4i16_imm(<4 x i32> %x) { ; CHECK-LABEL: vnsra_v4i32_v4i16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 16 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = ashr <4 x i32> %x, %b = trunc <4 x i32> %a to <4 x i16> @@ -150,8 +190,10 @@ define <4 x i16> @vnsra_v4i32_v4i16_imm(<4 x i32> %x) { define <2 x i32> @vnsra_v2i64_v2i32_imm(<2 x i64> %x) { ; CHECK-LABEL: vnsra_v2i64_v2i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 31 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 31 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = ashr <2 x i64> %x, %b = trunc <2 x i64> %a to <2 x i32> @@ -161,8 +203,11 @@ define <2 x i32> @vnsra_v2i64_v2i32_imm(<2 x i64> %x) { define <8 x i8> @vnsra_v8i16_v8i8_sext(<8 x i16> %x, <8 x i8> %y) { ; CHECK-LABEL: vnsra_v8i16_v8i8_sext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, 
mf2, ta, ma -; CHECK-NEXT: vnsra.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %sext = sext <8 x i8> %y to <8 x i16> %a = ashr <8 x i16> %x, %sext @@ -173,8 +218,11 @@ define <8 x i8> @vnsra_v8i16_v8i8_sext(<8 x i16> %x, <8 x i8> %y) { define <8 x i8> @vnsra_v8i16_v8i8_zext(<8 x i16> %x, <8 x i8> %y) { ; CHECK-LABEL: vnsra_v8i16_v8i8_zext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsra.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsra.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %sext = zext <8 x i8> %y to <8 x i16> %a = ashr <8 x i16> %x, %sext @@ -185,8 +233,10 @@ define <8 x i8> @vnsra_v8i16_v8i8_zext(<8 x i16> %x, <8 x i8> %y) { define <8 x i8> @vnsrl_v8i16_v8i8_scalar(<8 x i16> %x, i16 %y) { ; CHECK-LABEL: vnsrl_v8i16_v8i8_scalar: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <8 x i16> poison, i16 %y, i16 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> poison, <8 x i32> zeroinitializer @@ -199,7 +249,12 @@ define <8 x i8> @vnsrl_v8i16_v8i8_scalar_sext(<8 x i16> %x, i8 %y) { ; CHECK-LABEL: vnsrl_v8i16_v8i8_scalar_sext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <8 x i8> poison, i8 %y, i16 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer @@ -213,7 +268,12 @@ define <8 x i8> @vnsrl_v8i16_v8i8_scalar_zext(<8 x i16> %x, i8 %y) { ; CHECK-LABEL: vnsrl_v8i16_v8i8_scalar_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <8 x i8> poison, i8 %y, i16 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> poison, <8 x i32> zeroinitializer @@ -226,8 +286,10 @@ define <8 x i8> @vnsrl_v8i16_v8i8_scalar_zext(<8 x i16> %x, i8 %y) { define <4 x i16> @vnsrl_v4i32_v4i16_scalar(<4 x i32> %x, i32 %y) { ; CHECK-LABEL: vnsrl_v4i32_v4i16_scalar: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <4 x i32> poison, i32 %y, i32 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer @@ -240,7 +302,12 @@ define <4 x i16> @vnsrl_v4i32_v4i16_scalar_sext(<4 x i32> %x, i16 %y) { ; CHECK-LABEL: 
vnsrl_v4i32_v4i16_scalar_sext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <4 x i16> poison, i16 %y, i16 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer @@ -254,7 +321,12 @@ define <4 x i16> @vnsrl_v4i32_v4i16_scalar_zext(<4 x i32> %x, i16 %y) { ; CHECK-LABEL: vnsrl_v4i32_v4i16_scalar_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <4 x i16> poison, i16 %y, i16 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> poison, <4 x i32> zeroinitializer @@ -267,8 +339,10 @@ define <4 x i16> @vnsrl_v4i32_v4i16_scalar_zext(<4 x i32> %x, i16 %y) { define <2 x i32> @vnsrl_v2i64_v2i32_scalar(<2 x i64> %x, i64 %y) { ; CHECK-LABEL: vnsrl_v2i64_v2i32_scalar: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsrl.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <2 x i64> poison, i64 %y, i32 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> poison, <2 x i32> zeroinitializer @@ -281,7 +355,12 @@ define <2 x i32> @vnsrl_v2i64_v2i32_scalar_sext(<2 x i64> %x, i32 %y) { ; CHECK-LABEL: vnsrl_v2i64_v2i32_scalar_sext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <2 x i32> poison, i32 %y, i32 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer @@ -295,7 +374,12 @@ define <2 x i32> @vnsrl_v2i64_v2i32_scalar_zext(<2 x i64> %x, i32 %y) { ; CHECK-LABEL: vnsrl_v2i64_v2i32_scalar_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vmv.v.x v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %insert = insertelement <2 x i32> poison, i32 %y, i32 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> poison, <2 x i32> zeroinitializer @@ -308,8 +392,10 @@ define <2 x i32> @vnsrl_v2i64_v2i32_scalar_zext(<2 x i64> %x, i32 %y) { define <8 x i8> @vnsrl_v8i16_v8i8_imm(<8 x i16> %x) { ; CHECK-LABEL: vnsrl_v8i16_v8i8_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 8 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = lshr <8 x 
i16> %x, %b = trunc <8 x i16> %a to <8 x i8> @@ -319,8 +405,10 @@ define <8 x i8> @vnsrl_v8i16_v8i8_imm(<8 x i16> %x) { define <4 x i16> @vnsrl_v4i32_v4i16_imm(<4 x i32> %x) { ; CHECK-LABEL: vnsrl_v4i32_v4i16_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 16 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = lshr <4 x i32> %x, %b = trunc <4 x i32> %a to <4 x i16> @@ -330,8 +418,10 @@ define <4 x i16> @vnsrl_v4i32_v4i16_imm(<4 x i32> %x) { define <2 x i32> @vnsrl_v2i64_v2i32_imm(<2 x i64> %x) { ; CHECK-LABEL: vnsrl_v2i64_v2i32_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 31 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsrl.vi v8, v8, 31 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = lshr <2 x i64> %x, %b = trunc <2 x i64> %a to <2 x i32> @@ -341,8 +431,11 @@ define <2 x i32> @vnsrl_v2i64_v2i32_imm(<2 x i64> %x) { define <4 x i16> @vnsrl_v4i32_v4i16_sext(<4 x i32> %x, <4 x i16> %y) { ; CHECK-LABEL: vnsrl_v4i32_v4i16_sext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %sext = sext <4 x i16> %y to <4 x i32> %a = lshr <4 x i32> %x, %sext @@ -353,8 +446,11 @@ define <4 x i16> @vnsrl_v4i32_v4i16_sext(<4 x i32> %x, <4 x i16> %y) { define <4 x i16> @vnsrl_v4i32_v4i16_zext(<4 x i32> %x, <4 x i16> %y) { ; CHECK-LABEL: vnsrl_v4i32_v4i16_zext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vnsrl.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %zext = zext <4 x i16> %y to <4 x i32> %a = lshr <4 x i32> %x, %zext @@ -365,8 +461,11 @@ define <4 x i16> @vnsrl_v4i32_v4i16_zext(<4 x i32> %x, <4 x i16> %y) { define <2 x i32> @vnsrl_v2i64_v2i32_sext(<2 x i64> %x, <2 x i32> %y) { ; CHECK-LABEL: vnsrl_v2i64_v2i32_sext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %sext = sext <2 x i32> %y to <2 x i64> %a = lshr <2 x i64> %x, %sext @@ -377,8 +476,11 @@ define <2 x i32> @vnsrl_v2i64_v2i32_sext(<2 x i64> %x, <2 x i32> %y) { define <2 x i32> @vnsrl_v2i64_v2i32_zext(<2 x i64> %x, <2 x i32> %y) { ; CHECK-LABEL: vnsrl_v2i64_v2i32_zext: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vsrl.vv v8, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %zext = zext <2 x i32> %y to <2 x i64> %a = lshr <2 x i64> %x, %zext diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll index 352666de57881..47f87a7ccbaff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll @@ -529,18 +529,20 @@ define <8 x i16> @vpgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 define <8 x i16> @vpgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v9, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vadd.vv v8, v9, v9 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV32-NEXT: vluxei16.v v8, (a0), v9, v0.t +; RV32-NEXT: vluxei16.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v9, v8, v8 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vv v8, v9, v9 ; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV64-NEXT: vluxei16.v v8, (a0), v9, v0.t +; RV64-NEXT: vluxei16.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %eidxs @@ -551,8 +553,9 @@ define <8 x i16> @vpgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 define <8 x i16> @vpgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret @@ -736,18 +739,18 @@ define <8 x i32> @vpgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 define <8 x i32> @vpgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vsll.vi v10, v9, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vluxei16.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v10, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vsll.vi v10, v9, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vluxei16.v v8, (a0), v10, v0.t ; RV64-NEXT: ret @@ -760,11 +763,11 @@ define <8 x i32> @vpgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 define <8 x i32> @vpgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v8, v10, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV32-NEXT: vluxei32.v v8, (a0), v8, 
v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v8i16_v8i32: @@ -783,11 +786,11 @@ define <8 x i32> @vpgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x define <8 x i32> @vpgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v8, v10, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_sext_v8i16_v8i32: @@ -807,20 +810,20 @@ define <8 x i32> @vpgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, define <8 x i32> @vpgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v8, v10, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v10, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v10, v8 +; RV64-NEXT: vsll.vi v8, v10, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV64-NEXT: vluxei32.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs @@ -839,9 +842,9 @@ define <8 x i32> @vpgather_baseidx_v8i32(ptr %base, <8 x i32> %idxs, <8 x i1> %m ; ; RV64-LABEL: vpgather_baseidx_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v12, v12, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -974,18 +977,18 @@ define <8 x i64> @vpgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 define <8 x i64> @vpgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vsll.vi v12, v9, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei16.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vsll.vi v12, v9, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV64-NEXT: vluxei16.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -998,9 +1001,9 @@ define <8 x i64> @vpgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x 
i8> %idxs, <8 define <8 x i64> @vpgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8i16_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v12, v10, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret @@ -1021,9 +1024,9 @@ define <8 x i64> @vpgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x define <8 x i64> @vpgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v12, v10, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret @@ -1045,18 +1048,18 @@ define <8 x i64> @vpgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, define <8 x i64> @vpgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v12, v10, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v10, v8 +; RV64-NEXT: vsll.vi v12, v10, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV64-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -1077,11 +1080,11 @@ define <8 x i64> @vpgather_baseidx_v8i32_v8i64(ptr %base, <8 x i32> %idxs, <8 x ; ; RV64-LABEL: vpgather_baseidx_v8i32_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v8, v12, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds i64, ptr %base, <8 x i32> %idxs %v = call <8 x i64> @llvm.vp.gather.v8i64.v8p0(<8 x ptr> %ptrs, <8 x i1> %m, i32 %evl) @@ -1099,11 +1102,11 @@ define <8 x i64> @vpgather_baseidx_sext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, ; ; RV64-LABEL: vpgather_baseidx_sext_v8i32_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v8, v12, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs @@ -1122,11 +1125,11 @@ define <8 x i64> 
@vpgather_baseidx_zext_v8i32_v8i64(ptr %base, <8 x i32> %idxs, ; ; RV64-LABEL: vpgather_baseidx_zext_v8i32_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vzext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v8, v12, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs @@ -1284,18 +1287,20 @@ define <8 x bfloat> @vpgather_baseidx_sext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs define <8 x bfloat> @vpgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8bf16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v9, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vadd.vv v8, v9, v9 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV32-NEXT: vluxei16.v v8, (a0), v9, v0.t +; RV32-NEXT: vluxei16.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8bf16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v9, v8, v8 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vv v8, v9, v9 ; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV64-NEXT: vluxei16.v v8, (a0), v9, v0.t +; RV64-NEXT: vluxei16.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds bfloat, ptr %base, <8 x i16> %eidxs @@ -1306,8 +1311,9 @@ define <8 x bfloat> @vpgather_baseidx_zext_v8i8_v8bf16(ptr %base, <8 x i8> %idxs define <8 x bfloat> @vpgather_baseidx_v8bf16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8bf16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret @@ -1453,18 +1459,20 @@ define <8 x half> @vpgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, < define <8 x half> @vpgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v9, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vadd.vv v8, v9, v9 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV32-NEXT: vluxei16.v v8, (a0), v9, v0.t +; RV32-NEXT: vluxei16.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8f16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v9, v8, v8 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vv v8, v9, v9 ; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV64-NEXT: vluxei16.v v8, (a0), v9, v0.t +; RV64-NEXT: vluxei16.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, ptr %base, <8 x i16> %eidxs @@ -1475,8 +1483,9 @@ define <8 x half> 
@vpgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, < define <8 x half> @vpgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v8, v8 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret @@ -1618,18 +1627,18 @@ define <8 x float> @vpgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, define <8 x float> @vpgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vsll.vi v10, v9, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV32-NEXT: vluxei16.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v10, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vsll.vi v10, v9, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vluxei16.v v8, (a0), v10, v0.t ; RV64-NEXT: ret @@ -1642,11 +1651,11 @@ define <8 x float> @vpgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, define <8 x float> @vpgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v8, v10, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_v8i16_v8f32: @@ -1665,11 +1674,11 @@ define <8 x float> @vpgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 define <8 x float> @vpgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v8, v10, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV32-NEXT: vluxei32.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_sext_v8i16_v8f32: @@ -1689,20 +1698,20 @@ define <8 x float> @vpgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs define <8 x float> @vpgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v10, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v8, v10, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV32-NEXT: 
vluxei32.v v8, (a0), v8, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v10, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v10, v8 +; RV64-NEXT: vsll.vi v8, v10, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vluxei32.v v8, (a0), v10, v0.t +; RV64-NEXT: vluxei32.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs @@ -1721,9 +1730,9 @@ define <8 x float> @vpgather_baseidx_v8f32(ptr %base, <8 x i32> %idxs, <8 x i1> ; ; RV64-LABEL: vpgather_baseidx_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v12, v12, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -1856,18 +1865,18 @@ define <8 x double> @vpgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, define <8 x double> @vpgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_zext_v8i8_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v9, v8 +; RV32-NEXT: vsll.vi v12, v9, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei16.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i8_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vsll.vi v12, v9, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV64-NEXT: vluxei16.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -1880,9 +1889,9 @@ define <8 x double> @vpgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs, define <8 x double> @vpgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_v8i16_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v12, v10, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret @@ -1903,9 +1912,9 @@ define <8 x double> @vpgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 define <8 x double> @vpgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpgather_baseidx_sext_v8i16_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v12, v10, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret @@ -1927,18 +1936,18 @@ define <8 x double> @vpgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idx define <8 x double> @vpgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: 
vpgather_baseidx_zext_v8i16_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v8, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsll.vi v12, v10, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpgather_baseidx_zext_v8i16_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v10, v8 +; RV64-NEXT: vsll.vi v12, v10, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma ; RV64-NEXT: vluxei32.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -1959,11 +1968,11 @@ define <8 x double> @vpgather_baseidx_v8i32_v8f64(ptr %base, <8 x i32> %idxs, <8 ; ; RV64-LABEL: vpgather_baseidx_v8i32_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v8, v12, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, ptr %base, <8 x i32> %idxs %v = call <8 x double> @llvm.vp.gather.v8f64.v8p0(<8 x ptr> %ptrs, <8 x i1> %m, i32 %evl) @@ -1981,11 +1990,11 @@ define <8 x double> @vpgather_baseidx_sext_v8i32_v8f64(ptr %base, <8 x i32> %idx ; ; RV64-LABEL: vpgather_baseidx_sext_v8i32_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v8, v12, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs @@ -2004,11 +2013,11 @@ define <8 x double> @vpgather_baseidx_zext_v8i32_v8f64(ptr %base, <8 x i32> %idx ; ; RV64-LABEL: vpgather_baseidx_zext_v8i32_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulu.vx v12, v8, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vzext.vf2 v12, v8 +; RV64-NEXT: vsll.vi v8, v12, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v12, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs @@ -2209,25 +2218,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV32-LABEL: vpgather_baseidx_zext_v32i8_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 -; RV32-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; RV32-NEXT: vwmulu.vx v16, v8, a3 +; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; RV32-NEXT: vzext.vf2 v12, v8 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v12, 3 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB97_2 +; RV32-NEXT: bltu a1, a3, .LBB97_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB97_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei16.v v8, (a0), v16, v0.t -; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, ma 
-; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a1, a1, a2 -; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: vsetivli zero, 16, e16, m4, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei16.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2235,25 +2244,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i8_v32f64(ptr %base, <32 x i8> %i ; RV64-LABEL: vpgather_baseidx_zext_v32i8_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: li a2, 32 -; RV64-NEXT: li a3, 8 -; RV64-NEXT: li a4, 16 -; RV64-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; RV64-NEXT: vwmulu.vx v16, v8, a3 +; RV64-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; RV64-NEXT: vzext.vf2 v12, v8 +; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsll.vi v16, v12, 3 ; RV64-NEXT: mv a2, a1 -; RV64-NEXT: bltu a1, a4, .LBB97_2 +; RV64-NEXT: bltu a1, a3, .LBB97_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB97_2: ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vluxei16.v v8, (a0), v16, v0.t -; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma -; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: addi a2, a1, -16 -; RV64-NEXT: sltu a1, a1, a2 -; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: sltu a1, a1, a2 +; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: vsetivli zero, 16, e16, m4, ta, ma +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei16.v v16, (a0), v24, v0.t ; RV64-NEXT: ret @@ -2267,25 +2276,25 @@ define <32 x double> @vpgather_baseidx_v32i16_v32f64(ptr %base, <32 x i16> %idxs ; RV32-LABEL: vpgather_baseidx_v32i16_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vwmulsu.vx v16, v8, a3 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vsext.vf2 v16, v8 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB98_2 +; RV32-NEXT: bltu a1, a3, .LBB98_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB98_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a1, a1, a2 -; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2325,25 +2334,25 @@ define <32 x double> @vpgather_baseidx_sext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32-LABEL: vpgather_baseidx_sext_v32i16_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vwmulsu.vx v16, v8, a3 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vsext.vf2 v16, v8 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, 
.LBB99_2 +; RV32-NEXT: bltu a1, a3, .LBB99_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB99_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a1, a1, a2 -; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2385,25 +2394,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV32-LABEL: vpgather_baseidx_zext_v32i16_v32f64: ; RV32: # %bb.0: ; RV32-NEXT: li a2, 32 -; RV32-NEXT: li a3, 8 -; RV32-NEXT: li a4, 16 -; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vwmulu.vx v16, v8, a3 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vzext.vf2 v16, v8 +; RV32-NEXT: li a3, 16 +; RV32-NEXT: vsll.vi v16, v16, 3 ; RV32-NEXT: mv a2, a1 -; RV32-NEXT: bltu a1, a4, .LBB100_2 +; RV32-NEXT: bltu a1, a3, .LBB100_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB100_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: addi a2, a1, -16 -; RV32-NEXT: sltu a1, a1, a2 -; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV32-NEXT: vslidedown.vi v0, v0, 2 +; RV32-NEXT: sltu a1, a1, a2 +; RV32-NEXT: addi a1, a1, -1 ; RV32-NEXT: and a1, a1, a2 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v24, v16, 16 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV32-NEXT: ret @@ -2411,25 +2420,25 @@ define <32 x double> @vpgather_baseidx_zext_v32i16_v32f64(ptr %base, <32 x i16> ; RV64-LABEL: vpgather_baseidx_zext_v32i16_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: li a2, 32 -; RV64-NEXT: li a3, 8 -; RV64-NEXT: li a4, 16 -; RV64-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV64-NEXT: vwmulu.vx v16, v8, a3 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64-NEXT: vzext.vf2 v16, v8 +; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsll.vi v16, v16, 3 ; RV64-NEXT: mv a2, a1 -; RV64-NEXT: bltu a1, a4, .LBB100_2 +; RV64-NEXT: bltu a1, a3, .LBB100_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB100_2: ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV64-NEXT: vluxei32.v v8, (a0), v16, v0.t -; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: addi a2, a1, -16 -; RV64-NEXT: sltu a1, a1, a2 -; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; RV64-NEXT: vslidedown.vi v0, v0, 2 +; RV64-NEXT: sltu a1, a1, a2 +; RV64-NEXT: addi a1, a1, -1 ; RV64-NEXT: and a1, a1, a2 +; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV64-NEXT: vslidedown.vi v24, v16, 16 ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV64-NEXT: vluxei32.v v16, (a0), v24, v0.t ; RV64-NEXT: ret @@ -2468,19 +2477,20 @@ define <32 x double> @vpgather_baseidx_v32i32_v32f64(ptr %base, <32 x i32> %idxs ; RV64-LABEL: vpgather_baseidx_v32i32_v32f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 
-; RV64-NEXT: li a2, 8 +; RV64-NEXT: vslidedown.vi v16, v8, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v24, v8 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwmulsu.vx v16, v24, a2 -; RV64-NEXT: vwmulsu.vx v24, v8, a2 +; RV64-NEXT: vsext.vf2 v8, v16 +; RV64-NEXT: vsll.vi v16, v8, 3 +; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB101_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB101_2: ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 @@ -2523,20 +2533,22 @@ define <32 x double> @vpgather_baseidx_sext_v32i32_v32f64(ptr %base, <32 x i32> ; ; RV64-LABEL: vpgather_baseidx_sext_v32i32_v32f64: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v24, v8 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 -; RV64-NEXT: li a2, 8 +; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwmulsu.vx v16, v24, a2 -; RV64-NEXT: vwmulsu.vx v24, v8, a2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB102_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB102_2: ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 @@ -2580,20 +2592,22 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32> ; ; RV64-LABEL: vpgather_baseidx_zext_v32i32_v32f64: ; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v24, v8 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v24, v8, 16 -; RV64-NEXT: li a2, 8 +; RV64-NEXT: vslidedown.vi v8, v8, 16 ; RV64-NEXT: li a3, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwmulu.vx v16, v24, a2 -; RV64-NEXT: vwmulu.vx v24, v8, a2 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsll.vi v8, v24, 3 ; RV64-NEXT: mv a2, a1 ; RV64-NEXT: bltu a1, a3, .LBB103_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a2, 16 ; RV64-NEXT: .LBB103_2: ; RV64-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV64-NEXT: vluxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: vluxei64.v v8, (a0), v8, v0.t ; RV64-NEXT: addi a2, a1, -16 ; RV64-NEXT: sltu a1, a1, a2 ; RV64-NEXT: addi a1, a1, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll index 8e2e8f3fb0dec..6c9989775f790 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpload.ll @@ -402,29 +402,29 @@ define <33 x double> @vpload_v33f64(ptr %ptr, <33 x i1> %m, i32 zeroext %evl) { ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 32 ; CHECK-NEXT: .LBB32_2: -; CHECK-NEXT: addi a5, a3, -16 -; CHECK-NEXT: addi a4, a1, 128 -; CHECK-NEXT: addi a7, a2, -32 -; CHECK-NEXT: sltu a3, a3, a5 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a6, a3, a5 -; CHECK-NEXT: sltu a3, a2, a7 -; CHECK-NEXT: addi a3, a3, -1 -; 
CHECK-NEXT: and a5, a3, a7 -; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: addi a4, a3, -16 ; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v8, 2 -; CHECK-NEXT: bltu a5, a3, .LBB32_4 +; CHECK-NEXT: sltu a3, a3, a4 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a3, a3, a4 +; CHECK-NEXT: addi a4, a1, 128 +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v16, (a4), v0.t +; CHECK-NEXT: addi a3, a2, -32 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a4, a4, a3 +; CHECK-NEXT: li a3, 16 +; CHECK-NEXT: bltu a4, a3, .LBB32_4 ; CHECK-NEXT: # %bb.3: -; CHECK-NEXT: li a5, 16 +; CHECK-NEXT: li a4, 16 ; CHECK-NEXT: .LBB32_4: -; CHECK-NEXT: vsetvli zero, a6, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a4), v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v8, 4 -; CHECK-NEXT: addi a4, a1, 256 -; CHECK-NEXT: vsetvli zero, a5, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a4), v0.t +; CHECK-NEXT: addi a5, a1, 256 +; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a5), v0.t ; CHECK-NEXT: bltu a2, a3, .LBB32_6 ; CHECK-NEXT: # %bb.5: ; CHECK-NEXT: li a2, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index 4299707c9a48c..a5abdfe9e68ee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -17,9 +17,10 @@ define <4 x i1> @vpmerge_vv_v4i1(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %m, i32 ze ; RV32-NEXT: vid.v v10 ; RV32-NEXT: vmsltu.vx v10, v10, a0 ; RV32-NEXT: vmand.mm v9, v9, v10 -; RV32-NEXT: vmandn.mm v8, v8, v9 -; RV32-NEXT: vmand.mm v9, v0, v9 -; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: vmand.mm v10, v0, v9 +; RV32-NEXT: vmnot.m v9, v9 +; RV32-NEXT: vmand.mm v8, v8, v9 +; RV32-NEXT: vmor.mm v0, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vpmerge_vv_v4i1: @@ -28,9 +29,10 @@ define <4 x i1> @vpmerge_vv_v4i1(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %m, i32 ze ; RV64-NEXT: vid.v v10 ; RV64-NEXT: vmsltu.vx v12, v10, a0 ; RV64-NEXT: vmand.mm v9, v9, v12 -; RV64-NEXT: vmandn.mm v8, v8, v9 -; RV64-NEXT: vmand.mm v9, v0, v9 -; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: vmand.mm v10, v0, v9 +; RV64-NEXT: vmnot.m v9, v9 +; RV64-NEXT: vmand.mm v8, v8, v9 +; RV64-NEXT: vmor.mm v0, v10, v8 ; RV64-NEXT: ret ; ; RV32ZVFHMIN-LABEL: vpmerge_vv_v4i1: @@ -39,9 +41,10 @@ define <4 x i1> @vpmerge_vv_v4i1(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %m, i32 ze ; RV32ZVFHMIN-NEXT: vid.v v10 ; RV32ZVFHMIN-NEXT: vmsltu.vx v10, v10, a0 ; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 -; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV32ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV32ZVFHMIN-NEXT: ret ; ; RV64ZVFHMIN-LABEL: vpmerge_vv_v4i1: @@ -50,9 +53,10 @@ define <4 x i1> @vpmerge_vv_v4i1(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %m, i32 ze ; RV64ZVFHMIN-NEXT: vid.v v10 ; RV64ZVFHMIN-NEXT: vmsltu.vx v12, v10, a0 ; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v12 -; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV64ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV64ZVFHMIN-NEXT: ret %v = call <4 x i1> 
@llvm.vp.merge.v4i1(<4 x i1> %m, <4 x i1> %va, <4 x i1> %vb, i32 %evl) ret <4 x i1> %v @@ -65,9 +69,10 @@ define <8 x i1> @vpmerge_vv_v8i1(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %m, i32 ze ; RV32-NEXT: vid.v v10 ; RV32-NEXT: vmsltu.vx v12, v10, a0 ; RV32-NEXT: vmand.mm v9, v9, v12 -; RV32-NEXT: vmandn.mm v8, v8, v9 -; RV32-NEXT: vmand.mm v9, v0, v9 -; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: vmand.mm v10, v0, v9 +; RV32-NEXT: vmnot.m v9, v9 +; RV32-NEXT: vmand.mm v8, v8, v9 +; RV32-NEXT: vmor.mm v0, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vpmerge_vv_v8i1: @@ -76,9 +81,10 @@ define <8 x i1> @vpmerge_vv_v8i1(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %m, i32 ze ; RV64-NEXT: vid.v v12 ; RV64-NEXT: vmsltu.vx v10, v12, a0 ; RV64-NEXT: vmand.mm v9, v9, v10 -; RV64-NEXT: vmandn.mm v8, v8, v9 -; RV64-NEXT: vmand.mm v9, v0, v9 -; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: vmand.mm v10, v0, v9 +; RV64-NEXT: vmnot.m v9, v9 +; RV64-NEXT: vmand.mm v8, v8, v9 +; RV64-NEXT: vmor.mm v0, v10, v8 ; RV64-NEXT: ret ; ; RV32ZVFHMIN-LABEL: vpmerge_vv_v8i1: @@ -87,9 +93,10 @@ define <8 x i1> @vpmerge_vv_v8i1(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %m, i32 ze ; RV32ZVFHMIN-NEXT: vid.v v10 ; RV32ZVFHMIN-NEXT: vmsltu.vx v12, v10, a0 ; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v12 -; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV32ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV32ZVFHMIN-NEXT: ret ; ; RV64ZVFHMIN-LABEL: vpmerge_vv_v8i1: @@ -98,9 +105,10 @@ define <8 x i1> @vpmerge_vv_v8i1(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %m, i32 ze ; RV64ZVFHMIN-NEXT: vid.v v12 ; RV64ZVFHMIN-NEXT: vmsltu.vx v10, v12, a0 ; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v10 -; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV64ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV64ZVFHMIN-NEXT: ret %v = call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> %m, <8 x i1> %va, <8 x i1> %vb, i32 %evl) ret <8 x i1> %v @@ -113,9 +121,10 @@ define <16 x i1> @vpmerge_vv_v16i1(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %m, i ; RV32-NEXT: vid.v v12 ; RV32-NEXT: vmsltu.vx v10, v12, a0 ; RV32-NEXT: vmand.mm v9, v9, v10 -; RV32-NEXT: vmandn.mm v8, v8, v9 -; RV32-NEXT: vmand.mm v9, v0, v9 -; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: vmand.mm v10, v0, v9 +; RV32-NEXT: vmnot.m v9, v9 +; RV32-NEXT: vmand.mm v8, v8, v9 +; RV32-NEXT: vmor.mm v0, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vpmerge_vv_v16i1: @@ -124,9 +133,10 @@ define <16 x i1> @vpmerge_vv_v16i1(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %m, i ; RV64-NEXT: vid.v v16 ; RV64-NEXT: vmsltu.vx v10, v16, a0 ; RV64-NEXT: vmand.mm v9, v9, v10 -; RV64-NEXT: vmandn.mm v8, v8, v9 -; RV64-NEXT: vmand.mm v9, v0, v9 -; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: vmand.mm v10, v0, v9 +; RV64-NEXT: vmnot.m v9, v9 +; RV64-NEXT: vmand.mm v8, v8, v9 +; RV64-NEXT: vmor.mm v0, v10, v8 ; RV64-NEXT: ret ; ; RV32ZVFHMIN-LABEL: vpmerge_vv_v16i1: @@ -135,9 +145,10 @@ define <16 x i1> @vpmerge_vv_v16i1(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %m, i ; RV32ZVFHMIN-NEXT: vid.v v12 ; RV32ZVFHMIN-NEXT: vmsltu.vx v10, v12, a0 ; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 -; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; 
RV32ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV32ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV32ZVFHMIN-NEXT: ret ; ; RV64ZVFHMIN-LABEL: vpmerge_vv_v16i1: @@ -146,9 +157,10 @@ define <16 x i1> @vpmerge_vv_v16i1(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %m, i ; RV64ZVFHMIN-NEXT: vid.v v16 ; RV64ZVFHMIN-NEXT: vmsltu.vx v10, v16, a0 ; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v10 -; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV64ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV64ZVFHMIN-NEXT: ret %v = call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> %m, <16 x i1> %va, <16 x i1> %vb, i32 %evl) ret <16 x i1> %v @@ -162,9 +174,10 @@ define <32 x i1> @vpmerge_vv_v32i1(<32 x i1> %va, <32 x i1> %vb, <32 x i1> %m, i ; RV32-NEXT: vid.v v16 ; RV32-NEXT: vmsltu.vx v10, v16, a0 ; RV32-NEXT: vmand.mm v9, v9, v10 -; RV32-NEXT: vmandn.mm v8, v8, v9 -; RV32-NEXT: vmand.mm v9, v0, v9 -; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: vmand.mm v10, v0, v9 +; RV32-NEXT: vmnot.m v9, v9 +; RV32-NEXT: vmand.mm v8, v8, v9 +; RV32-NEXT: vmor.mm v0, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vpmerge_vv_v32i1: @@ -190,9 +203,10 @@ define <32 x i1> @vpmerge_vv_v32i1(<32 x i1> %va, <32 x i1> %vb, <32 x i1> %m, i ; RV32ZVFHMIN-NEXT: vid.v v16 ; RV32ZVFHMIN-NEXT: vmsltu.vx v10, v16, a0 ; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 -; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 -; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 -; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: vmand.mm v10, v0, v9 +; RV32ZVFHMIN-NEXT: vmnot.m v9, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v10, v8 ; RV32ZVFHMIN-NEXT: ret ; ; RV64ZVFHMIN-LABEL: vpmerge_vv_v32i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll index c361ccce14e4a..8d23c7dd565ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpscatter.ll @@ -374,18 +374,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i define void @vpscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v9, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v10, v9, v9 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v10, v9 +; RV64-NEXT: vadd.vv v9, v10, v10 ; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV64-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV64-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds i16, ptr %base, <8 x i16> %eidxs @@ -396,8 +398,9 @@ define void @vpscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i define void @vpscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, 
i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v8i16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret @@ -556,20 +559,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i define void @vpscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v11, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v11, v10 +; RV32-NEXT: vsll.vi v10, v11, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v11, v10, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v11, v10 +; RV64-NEXT: vsll.vi v10, v11, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV64-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs @@ -580,11 +583,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i define void @vpscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_v8i16_v8i32: @@ -603,11 +606,11 @@ define void @vpscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> define void @vpscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8i32: @@ -627,20 +630,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x define void @vpscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8i32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 
; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v12, v10, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v12, v10 +; RV64-NEXT: vsll.vi v10, v12, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV64-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds i32, ptr %base, <8 x i32> %eidxs @@ -659,9 +662,9 @@ define void @vpscatter_baseidx_v8i32(<8 x i32> %val, ptr %base, <8 x i32> %idxs, ; ; RV64-LABEL: vpscatter_baseidx_v8i32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v10, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v10 +; RV64-NEXT: vsll.vi v12, v12, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -790,20 +793,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i define void @vpscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v13, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v13, v12 +; RV32-NEXT: vsll.vi v12, v13, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v13, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v13, v12 +; RV64-NEXT: vsll.vi v12, v13, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV64-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs @@ -814,11 +817,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i define void @vpscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v8i16_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v14, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v14, v12 +; RV32-NEXT: vsll.vi v12, v14, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_v8i16_v8i64: @@ -837,11 +840,11 @@ define void @vpscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> define void @vpscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v14, v12, a2 +; RV32-NEXT: vsetivli 
zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v14, v12 +; RV32-NEXT: vsll.vi v12, v14, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8i64: @@ -861,20 +864,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x define void @vpscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8i64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v14, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v14, v12 +; RV32-NEXT: vsll.vi v12, v14, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v14, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v14, v12 +; RV64-NEXT: vsll.vi v12, v14, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs @@ -893,11 +896,11 @@ define void @vpscatter_baseidx_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x i32> ; ; RV64-LABEL: vpscatter_baseidx_v8i32_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v16, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds i64, ptr %base, <8 x i32> %idxs call void @llvm.vp.scatter.v8i64.v8p0(<8 x i64> %val, <8 x ptr> %ptrs, <8 x i1> %m, i32 %evl) @@ -915,11 +918,11 @@ define void @vpscatter_baseidx_sext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x ; ; RV64-LABEL: vpscatter_baseidx_sext_v8i32_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v16, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs @@ -938,11 +941,11 @@ define void @vpscatter_baseidx_zext_v8i32_v8i64(<8 x i64> %val, ptr %base, <8 x ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i32_v8i64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulu.vx v16, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vzext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds i64, ptr %base, <8 x i64> %eidxs @@ -1162,18 +1165,20 
@@ define void @vpscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x define void @vpscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwaddu.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v9, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8f16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwaddu.vv v10, v9, v9 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v10, v9 +; RV64-NEXT: vadd.vv v9, v10, v10 ; RV64-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; RV64-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; RV64-NEXT: vsoxei16.v v8, (a0), v9, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i16> %ptrs = getelementptr inbounds half, ptr %base, <8 x i16> %eidxs @@ -1184,8 +1189,9 @@ define void @vpscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x define void @vpscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v8f16: ; RV32: # %bb.0: -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwadd.vv v10, v9, v9 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v10, v9 +; RV32-NEXT: vadd.vv v10, v10, v10 ; RV32-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret @@ -1323,20 +1329,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x define void @vpscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v11, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v11, v10 +; RV32-NEXT: vsll.vi v10, v11, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v11, v10, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v11, v10 +; RV64-NEXT: vsll.vi v10, v11, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vsoxei16.v v8, (a0), v11, v0.t +; RV64-NEXT: vsoxei16.v v8, (a0), v10, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs @@ -1347,11 +1353,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x define void @vpscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), 
v12, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_v8i16_v8f32: @@ -1370,11 +1376,11 @@ define void @vpscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16 define void @vpscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v12, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8f32: @@ -1394,20 +1400,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 define void @vpscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8f32: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 4 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v12, v10, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v12, v10 +; RV32-NEXT: vsll.vi v10, v12, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v12, v10, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v12, v10 +; RV64-NEXT: vsll.vi v10, v12, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; RV64-NEXT: vsoxei32.v v8, (a0), v10, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i32> %ptrs = getelementptr inbounds float, ptr %base, <8 x i32> %eidxs @@ -1426,9 +1432,9 @@ define void @vpscatter_baseidx_v8f32(<8 x float> %val, ptr %base, <8 x i32> %idx ; ; RV64-LABEL: vpscatter_baseidx_v8f32: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v12, v10, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v12, v10 +; RV64-NEXT: vsll.vi v12, v12, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma ; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret @@ -1557,20 +1563,20 @@ define void @vpscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 define void @vpscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i8_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vwmulu.vx v13, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV32-NEXT: vzext.vf2 v13, v12 +; RV32-NEXT: vsll.vi v12, v13, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV32-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i8_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vwmulu.vx v13, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; RV64-NEXT: vzext.vf2 v13, v12 +; RV64-NEXT: vsll.vi v12, v13, 3 ; RV64-NEXT: 
vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei16.v v8, (a0), v13, v0.t +; RV64-NEXT: vsoxei16.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i8> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs @@ -1581,11 +1587,11 @@ define void @vpscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 define void @vpscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_v8i16_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v14, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v14, v12 +; RV32-NEXT: vsll.vi v12, v14, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_v8i16_v8f64: @@ -1604,11 +1610,11 @@ define void @vpscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i1 define void @vpscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_sext_v8i16_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulsu.vx v14, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vsext.vf2 v14, v12 +; RV32-NEXT: vsll.vi v12, v14, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_sext_v8i16_v8f64: @@ -1628,20 +1634,20 @@ define void @vpscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 define void @vpscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16> %idxs, <8 x i1> %m, i32 zeroext %evl) { ; RV32-LABEL: vpscatter_baseidx_zext_v8i16_v8f64: ; RV32: # %bb.0: -; RV32-NEXT: li a2, 8 -; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV32-NEXT: vwmulu.vx v14, v12, a2 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vzext.vf2 v14, v12 +; RV32-NEXT: vsll.vi v12, v14, 3 ; RV32-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV32-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV32-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV32-NEXT: ret ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i16_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; RV64-NEXT: vwmulu.vx v14, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vzext.vf2 v14, v12 +; RV64-NEXT: vsll.vi v12, v14, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei32.v v8, (a0), v14, v0.t +; RV64-NEXT: vsoxei32.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i16> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs @@ -1660,11 +1666,11 @@ define void @vpscatter_baseidx_v8i32_v8f64(<8 x double> %val, ptr %base, <8 x i3 ; ; RV64-LABEL: vpscatter_baseidx_v8i32_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v16, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %ptrs = getelementptr inbounds double, ptr %base, <8 x i32> %idxs call void 
@llvm.vp.scatter.v8f64.v8p0(<8 x double> %val, <8 x ptr> %ptrs, <8 x i1> %m, i32 %evl) @@ -1682,11 +1688,11 @@ define void @vpscatter_baseidx_sext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; ; RV64-LABEL: vpscatter_baseidx_sext_v8i32_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulsu.vx v16, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = sext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs @@ -1705,11 +1711,11 @@ define void @vpscatter_baseidx_zext_v8i32_v8f64(<8 x double> %val, ptr %base, <8 ; ; RV64-LABEL: vpscatter_baseidx_zext_v8i32_v8f64: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 8 -; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV64-NEXT: vwmulu.vx v16, v12, a2 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vzext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v12, v16, 3 ; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma -; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v12, v0.t ; RV64-NEXT: ret %eidxs = zext <8 x i32> %idxs to <8 x i64> %ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs @@ -1842,35 +1848,36 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 4 +; RV64-NEXT: slli a4, a3, 3 +; RV64-NEXT: add a3, a4, a3 ; RV64-NEXT: sub sp, sp, a3 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x09, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 9 * vlenb +; RV64-NEXT: addi a3, sp, 16 +; RV64-NEXT: vs1r.v v0, (a3) # vscale x 8-byte Folded Spill ; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: slli a3, a3, 3 ; RV64-NEXT: add a3, sp, a3 ; RV64-NEXT: addi a3, a3, 16 ; RV64-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill -; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: li a1, 8 ; RV64-NEXT: li a3, 16 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v24, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwmulsu.vx v8, v16, a1 -; RV64-NEXT: vwmulsu.vx v16, v24, a1 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v0, v24 +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsll.vi v24, v0, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB84_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB84_2: ; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl1r.v v0, (a3) # vscale x 8-byte Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 @@ -1878,14 +1885,14 @@ define void @vpscatter_baseidx_v32i32_v32f64(<32 x double> %val, ptr %base, <32 ; RV64-NEXT: vslidedown.vi v0, v0, 2 ; 
RV64-NEXT: and a1, a2, a1 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: add a0, a1, a0 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: .cfi_def_cfa sp, 16 ; RV64-NEXT: addi sp, sp, 16 @@ -1941,22 +1948,24 @@ define void @vpscatter_baseidx_sext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: li a1, 8 ; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v16, v24 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v24, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwmulsu.vx v8, v16, a1 -; RV64-NEXT: vwmulsu.vx v16, v24, a1 +; RV64-NEXT: vslidedown.vi v8, v24, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB85_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB85_2: ; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 @@ -2028,22 +2037,24 @@ define void @vpscatter_baseidx_zext_v32i32_v32f64(<32 x double> %val, ptr %base, ; RV64-NEXT: li a3, 32 ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle32.v v24, (a1) -; RV64-NEXT: li a1, 8 ; RV64-NEXT: li a3, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v16, v24 ; RV64-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV64-NEXT: vslidedown.vi v16, v24, 16 -; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV64-NEXT: vwmulu.vx v8, v16, a1 -; RV64-NEXT: vwmulu.vx v16, v24, a1 +; RV64-NEXT: vslidedown.vi v8, v24, 16 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsll.vi v24, v16, 3 ; RV64-NEXT: mv a1, a2 ; RV64-NEXT: bltu a2, a3, .LBB86_2 ; RV64-NEXT: # %bb.1: ; RV64-NEXT: li a1, 16 ; RV64-NEXT: .LBB86_2: ; RV64-NEXT: addi a3, sp, 16 -; RV64-NEXT: vl8r.v v24, (a3) # vscale x 64-byte Folded Reload +; RV64-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV64-NEXT: vsoxei64.v v24, (a0), v16, v0.t +; RV64-NEXT: vsoxei64.v v16, (a0), v24, v0.t ; RV64-NEXT: addi a1, a2, -16 ; RV64-NEXT: sltu a2, a2, a1 ; RV64-NEXT: addi a2, a2, -1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll index 0d31ec5f78435..f5dce90c4cbf8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vreductions-mask.ll @@ -902,7 +902,8 @@ define zeroext i1 @vreduce_and_v256i1(<256 x i1> %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli 
zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmnand.mm v8, v0, v8 +; CHECK-NEXT: vmand.mm v8, v0, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -932,7 +933,8 @@ define zeroext i1 @vreduce_smax_v256i1(<256 x i1> %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmnand.mm v8, v0, v8 +; CHECK-NEXT: vmand.mm v8, v0, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -947,7 +949,8 @@ define zeroext i1 @vreduce_umin_v256i1(<256 x i1> %v) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vmnand.mm v8, v0, v8 +; CHECK-NEXT: vmand.mm v8, v0, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -1013,7 +1016,8 @@ define zeroext i1 @vreduce_and_v512i1(<512 x i1> %v) { ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmand.mm v8, v8, v10 ; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmnand.mm v8, v9, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -1047,7 +1051,8 @@ define zeroext i1 @vreduce_smax_v512i1(<512 x i1> %v) { ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmand.mm v8, v8, v10 ; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmnand.mm v8, v9, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -1064,7 +1069,8 @@ define zeroext i1 @vreduce_umin_v512i1(<512 x i1> %v) { ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vmand.mm v8, v8, v10 ; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmnand.mm v8, v9, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -1144,7 +1150,8 @@ define zeroext i1 @vreduce_and_v1024i1(<1024 x i1> %v) { ; CHECK-NEXT: vmand.mm v11, v0, v11 ; CHECK-NEXT: vmand.mm v8, v8, v10 ; CHECK-NEXT: vmand.mm v9, v11, v9 -; CHECK-NEXT: vmnand.mm v8, v9, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -1186,7 +1193,8 @@ define zeroext i1 @vreduce_smax_v1024i1(<1024 x i1> %v) { ; CHECK-NEXT: vmand.mm v11, v0, v11 ; CHECK-NEXT: vmand.mm v8, v8, v10 ; CHECK-NEXT: vmand.mm v9, v11, v9 -; CHECK-NEXT: vmnand.mm v8, v9, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret @@ -1207,7 +1215,8 @@ define zeroext i1 @vreduce_umin_v1024i1(<1024 x i1> %v) { ; CHECK-NEXT: vmand.mm v11, v0, v11 ; CHECK-NEXT: vmand.mm v8, v8, v10 ; CHECK-NEXT: vmand.mm v9, v11, v9 -; CHECK-NEXT: vmnand.mm v8, v9, v8 +; CHECK-NEXT: vmand.mm v8, v9, v8 +; CHECK-NEXT: vmnot.m v8, v8 ; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: seqz a0, a0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll index 32ae81926bbee..8e41684bd4b2b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vrol.ll @@ -950,16 +950,18 @@ define <2 x i64> @vrol_vx_v2i64(<2 x i64> %a, i64 %b) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a1, 63 ; RV32-NEXT: vsetvli zero, 
zero, e32, mf2, ta, ma ; RV32-NEXT: vmv.v.i v10, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vand.vx v9, v9, a1 +; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV32-NEXT: vwsub.vx v11, v10, a0 -; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; RV32-NEXT: vand.vx v9, v9, a0 -; RV32-NEXT: vand.vx v10, v11, a0 -; RV32-NEXT: vsrl.vv v10, v8, v10 -; RV32-NEXT: vsll.vv v8, v8, v9 -; RV32-NEXT: vor.vv v8, v8, v10 +; RV32-NEXT: vsll.vv v9, v8, v9 +; RV32-NEXT: vand.vx v10, v11, a1 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v9, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vrol_vx_v2i64: @@ -1015,16 +1017,18 @@ define <4 x i64> @vrol_vx_v4i64(<4 x i64> %a, i64 %b) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a1, 63 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vmv.v.i v14, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vand.vx v10, v10, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vwsub.vx v12, v14, a0 -; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; RV32-NEXT: vand.vx v10, v10, a0 -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vsrl.vv v12, v8, v12 -; RV32-NEXT: vsll.vv v8, v8, v10 -; RV32-NEXT: vor.vv v8, v8, v12 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vsrl.vv v8, v8, v12 +; RV32-NEXT: vor.vv v8, v10, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vrol_vx_v4i64: @@ -1080,16 +1084,18 @@ define <8 x i64> @vrol_vx_v8i64(<8 x i64> %a, i64 %b) { ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV32-NEXT: vmv.v.x v12, a0 +; RV32-NEXT: li a1, 63 ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vmv.v.i v20, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vand.vx v12, v12, a1 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vwsub.vx v16, v20, a0 -; RV32-NEXT: li a0, 63 ; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; RV32-NEXT: vand.vx v12, v12, a0 -; RV32-NEXT: vand.vx v16, v16, a0 -; RV32-NEXT: vsrl.vv v16, v8, v16 -; RV32-NEXT: vsll.vv v8, v8, v12 -; RV32-NEXT: vor.vv v8, v8, v16 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vand.vx v16, v16, a1 +; RV32-NEXT: vsrl.vv v8, v8, v16 +; RV32-NEXT: vor.vv v8, v12, v8 ; RV32-NEXT: ret ; ; RV64-LABEL: vrol_vx_v8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll index ec22d2be1eaad..9a958de85922e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vror.ll @@ -1603,16 +1603,16 @@ define <1 x i64> @vror_vx_v1i64(<1 x i64> %a, i64 %b) { define <1 x i64> @vror_vi_v1i64(<1 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v1i64: ; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v9, 1 -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: vmv.s.x v10, a0 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: vand.vi v10, v10, 1 -; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v8, v9 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v10, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v1i64: @@ -1636,16 +1636,16 @@ define <1 x 
i64> @vror_vi_v1i64(<1 x i64> %a) { define <1 x i64> @vror_vi_rotl_v1i64(<1 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v1i64: ; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v9, 1 -; CHECK-RV32-NEXT: li a0, 63 -; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: vmv.s.x v10, a0 -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vrsub.vi v9, v9, 0 ; CHECK-RV32-NEXT: vand.vi v10, v10, 1 -; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v8, v9 +; CHECK-RV32-NEXT: vand.vx v9, v9, a0 +; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 +; CHECK-RV32-NEXT: vor.vv v8, v10, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v1i64: @@ -1696,16 +1696,18 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v9, a0 +; CHECK-RV32-NEXT: li a1, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v10, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vx v9, v9, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; CHECK-RV32-NEXT: vwsub.vx v11, v10, a0 -; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vand.vx v9, v9, a0 -; CHECK-RV32-NEXT: vand.vx v10, v11, a0 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 +; CHECK-RV32-NEXT: vand.vx v10, v11, a1 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v9, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_v2i64: @@ -1735,18 +1737,20 @@ define <2 x i64> @vror_vx_v2i64(<2 x i64> %a, i64 %b) { define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v2i64: ; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v9, 0 -; CHECK-RV32-NEXT: li a0, 1 -; CHECK-RV32-NEXT: vwsubu.vx v10, v9, a0 -; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v9, a0 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vi v9, v9, 1 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vwsubu.vx v11, v9, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vi v9, v10, 1 +; CHECK-RV32-NEXT: vand.vx v10, v11, a0 +; CHECK-RV32-NEXT: vsrl.vv v9, v8, v9 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v9, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v2i64: @@ -1770,18 +1774,20 @@ define <2 x i64> @vror_vi_v2i64(<2 x i64> %a) { define <2 x i64> @vror_vi_rotl_v2i64(<2 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v2i64: ; CHECK-RV32: # %bb.0: +; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v9, 0 -; CHECK-RV32-NEXT: li a0, 1 -; CHECK-RV32-NEXT: vwsubu.vx v10, v9, a0 -; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v9, a0 -; CHECK-RV32-NEXT: 
vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vi v9, v9, 1 -; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v9 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-RV32-NEXT: vwsubu.vx v11, v9, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-RV32-NEXT: vand.vi v9, v10, 1 +; CHECK-RV32-NEXT: vand.vx v10, v11, a0 +; CHECK-RV32-NEXT: vsll.vv v9, v8, v9 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 +; CHECK-RV32-NEXT: vor.vv v8, v9, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v2i64: @@ -1832,16 +1838,18 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: li a1, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v14, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vx v10, v10, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; CHECK-RV32-NEXT: vwsub.vx v12, v14, a0 -; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v10 -; CHECK-RV32-NEXT: vor.vv v8, v8, v12 +; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 +; CHECK-RV32-NEXT: vand.vx v12, v12, a1 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v10, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_v4i64: @@ -1871,18 +1879,20 @@ define <4 x i64> @vror_vx_v4i64(<4 x i64> %a, i64 %b) { define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v4i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 0 -; CHECK-RV32-NEXT: li a0, 1 -; CHECK-RV32-NEXT: vwsubu.vx v10, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v14, 0 +; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vi v12, v12, 1 -; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-RV32-NEXT: vwsubu.vx v12, v14, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v10, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v4i64: @@ -1906,18 +1916,20 @@ define <4 x i64> @vror_vi_v4i64(<4 x i64> %a) { define <4 x i64> @vror_vi_rotl_v4i64(<4 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v4i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v12, 0 -; CHECK-RV32-NEXT: li a0, 1 -; CHECK-RV32-NEXT: vwsubu.vx v10, v12, a0 ; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v14, 0 +; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v12, a0 -; CHECK-RV32-NEXT: vand.vx v10, v10, a0 -; CHECK-RV32-NEXT: vand.vi v12, v12, 1 -; 
CHECK-RV32-NEXT: vsrl.vv v10, v8, v10 -; CHECK-RV32-NEXT: vsll.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v8, v10 +; CHECK-RV32-NEXT: vmv.v.x v10, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-RV32-NEXT: vwsubu.vx v12, v14, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-RV32-NEXT: vand.vi v10, v10, 1 +; CHECK-RV32-NEXT: vand.vx v12, v12, a0 +; CHECK-RV32-NEXT: vsll.vv v10, v8, v10 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 +; CHECK-RV32-NEXT: vor.vv v8, v10, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v4i64: @@ -1968,16 +1980,18 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { ; CHECK-RV32: # %bb.0: ; CHECK-RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: li a1, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: vmv.v.i v20, 0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vx v12, v12, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-RV32-NEXT: vwsub.vx v16, v20, a0 -; CHECK-RV32-NEXT: li a0, 63 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vand.vx v16, v16, a0 -; CHECK-RV32-NEXT: vsll.vv v16, v8, v16 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v12 -; CHECK-RV32-NEXT: vor.vv v8, v8, v16 +; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 +; CHECK-RV32-NEXT: vand.vx v16, v16, a1 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v12, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vx_v8i64: @@ -2007,18 +2021,20 @@ define <8 x i64> @vror_vx_v8i64(<8 x i64> %a, i64 %b) { define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_v8i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v16, 0 -; CHECK-RV32-NEXT: li a0, 1 -; CHECK-RV32-NEXT: vwsubu.vx v12, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v20, 0 +; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v16, a0 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vand.vi v16, v16, 1 -; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 -; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v8, v12 +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vwsubu.vx v16, v20, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vi v12, v12, 1 +; CHECK-RV32-NEXT: vand.vx v16, v16, a0 +; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 +; CHECK-RV32-NEXT: vsll.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v12, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_v8i64: @@ -2042,18 +2058,20 @@ define <8 x i64> @vror_vi_v8i64(<8 x i64> %a) { define <8 x i64> @vror_vi_rotl_v8i64(<8 x i64> %a) { ; CHECK-RV32-LABEL: vror_vi_rotl_v8i64: ; CHECK-RV32: # %bb.0: -; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-RV32-NEXT: vmv.v.i v16, 0 -; CHECK-RV32-NEXT: li a0, 1 -; CHECK-RV32-NEXT: vwsubu.vx v12, v16, a0 ; CHECK-RV32-NEXT: li a0, 63 +; CHECK-RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-RV32-NEXT: vmv.v.i v20, 0 +; CHECK-RV32-NEXT: li a1, 1 ; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-RV32-NEXT: vmv.v.x v16, a0 -; CHECK-RV32-NEXT: vand.vx v12, v12, a0 -; CHECK-RV32-NEXT: vand.vi v16, v16, 1 -; CHECK-RV32-NEXT: vsrl.vv v12, v8, v12 -; CHECK-RV32-NEXT: 
vsll.vv v8, v8, v16 -; CHECK-RV32-NEXT: vor.vv v8, v8, v12 +; CHECK-RV32-NEXT: vmv.v.x v12, a0 +; CHECK-RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-RV32-NEXT: vwsubu.vx v16, v20, a1 +; CHECK-RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-RV32-NEXT: vand.vi v12, v12, 1 +; CHECK-RV32-NEXT: vand.vx v16, v16, a0 +; CHECK-RV32-NEXT: vsll.vv v12, v8, v12 +; CHECK-RV32-NEXT: vsrl.vv v8, v8, v16 +; CHECK-RV32-NEXT: vor.vv v8, v12, v8 ; CHECK-RV32-NEXT: ret ; ; CHECK-RV64-LABEL: vror_vi_rotl_v8i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll index 8b0e6c1a00811..e68fca6f6faeb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vscale-range.ll @@ -35,39 +35,24 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: addi a2, a1, 128 -; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vle8.v v0, (a1) ; CHECK-NEXT: addi a1, a3, 256 ; CHECK-NEXT: vle8.v v8, (a1) -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle8.v v8, (a2) -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vle8.v v16, (a2) ; CHECK-NEXT: vle8.v v24, (a4) -; CHECK-NEXT: vle8.v v0, (a3) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vle8.v v24, (a3) ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 4 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vadd.vv v8, v0, v8 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a2, 24 @@ -75,34 +60,39 @@ define <512 x i8> @vadd_v512i8_zvl128(<512 x i8> %a, <512 x i8> %b) #0 { ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v16, v16, v8 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v24, v8, v24 +; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: li a2, 40 ; CHECK-NEXT: mul a1, a1, a2 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: 
vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v0, v8, v0 -; CHECK-NEXT: vse8.v v0, (a0) +; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vl8r.v v0, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v24, v24, v0 +; CHECK-NEXT: vse8.v v24, (a0) ; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vse8.v v16, (a1) +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: addi a1, a0, 256 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 4 +; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vse8.v v24, (a0) +; CHECK-NEXT: vse8.v v16, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: li a1, 48 ; CHECK-NEXT: mul a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll index 7b2dcbb025f8f..d718a5389bd66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll @@ -14,8 +14,9 @@ define <1 x i1> @select_v1i1(<1 x i1> %a, <1 x i1> %b, <1 x i1> %c, i32 zeroext ; CHECK-LABEL: select_v1i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmandn.mm v9, v9, v0 ; CHECK-NEXT: vmand.mm v8, v8, v0 +; CHECK-NEXT: vmnot.m v10, v0 +; CHECK-NEXT: vmand.mm v9, v9, v10 ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %v = call <1 x i1> @llvm.vp.select.v1i1(<1 x i1> %a, <1 x i1> %b, <1 x i1> %c, i32 %evl) @@ -28,8 +29,9 @@ define <2 x i1> @select_v2i1(<2 x i1> %a, <2 x i1> %b, <2 x i1> %c, i32 zeroext ; CHECK-LABEL: select_v2i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma -; CHECK-NEXT: vmandn.mm v9, v9, v0 ; CHECK-NEXT: vmand.mm v8, v8, v0 +; CHECK-NEXT: vmnot.m v10, v0 +; CHECK-NEXT: vmand.mm v9, v9, v10 ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %v = call <2 x i1> @llvm.vp.select.v2i1(<2 x i1> %a, <2 x i1> %b, <2 x i1> %c, i32 %evl) @@ -42,8 +44,9 @@ define <4 x i1> @select_v4i1(<4 x i1> %a, <4 x i1> %b, <4 x i1> %c, i32 zeroext ; CHECK-LABEL: select_v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma -; CHECK-NEXT: vmandn.mm v9, v9, v0 ; CHECK-NEXT: vmand.mm v8, v8, v0 +; CHECK-NEXT: vmnot.m v10, v0 +; CHECK-NEXT: vmand.mm v9, v9, v10 ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %v = call <4 x i1> @llvm.vp.select.v4i1(<4 x i1> %a, <4 x i1> %b, <4 x i1> %c, i32 %evl) @@ -56,8 +59,9 @@ define <8 x i1> @select_v8i1(<8 x i1> %a, <8 x i1> %b, <8 x i1> %c, i32 zeroext ; CHECK-LABEL: select_v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma -; CHECK-NEXT: vmandn.mm v9, v9, v0 ; CHECK-NEXT: vmand.mm v8, v8, v0 +; CHECK-NEXT: vmnot.m v10, v0 +; CHECK-NEXT: vmand.mm v9, v9, v10 ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %v = call <8 x i1> @llvm.vp.select.v8i1(<8 x i1> %a, <8 x i1> %b, <8 x i1> %c, i32 %evl) @@ -70,8 +74,9 @@ define <16 x i1> @select_v16i1(<16 x i1> %a, <16 x i1> %b, <16 x i1> %c, i32 zer ; CHECK-LABEL: select_v16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma -; CHECK-NEXT: vmandn.mm v9, v9, v0 ; CHECK-NEXT: vmand.mm v8, v8, v0 +; CHECK-NEXT: vmnot.m v10, v0 +; CHECK-NEXT: vmand.mm v9, v9, v10 ; CHECK-NEXT: vmor.mm v0, v8, v9 ; CHECK-NEXT: ret %v = call <16 x i1> 
@llvm.vp.select.v16i1(<16 x i1> %a, <16 x i1> %b, <16 x i1> %c, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll index b97fa1d3a51ec..e703866cb3f12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect.ll @@ -7,30 +7,31 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32: # %bb.0: ; RV32-NEXT: lbu a2, 0(a2) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vle32.v v10, (a1) +; RV32-NEXT: slli a0, a2, 30 +; RV32-NEXT: andi a1, a2, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: slli a4, a2, 29 +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: slli a0, a2, 28 ; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 28 -; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: slli a4, a2, 27 +; RV32-NEXT: vslide1down.vx v12, v12, a1 +; RV32-NEXT: slli a1, a2, 27 ; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: vslide1down.vx v12, v12, a1 +; RV32-NEXT: vslide1down.vx v12, v12, a2 ; RV32-NEXT: vsetivli zero, 6, e8, mf2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v10, 2 -; RV32-NEXT: vand.vi v10, v10, 1 -; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; RV32-NEXT: vle32.v v8, (a0), v0.t +; RV32-NEXT: vslidedown.vi v12, v12, 2 +; RV32-NEXT: vand.vi v12, v12, 1 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: vse32.v v8, (a3) ; RV32-NEXT: ret ; @@ -38,30 +39,31 @@ define void @vselect_vv_v6i32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV64: # %bb.0: ; RV64-NEXT: lbu a2, 0(a2) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a1) -; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vle32.v v10, (a1) +; RV64-NEXT: slli a0, a2, 62 +; RV64-NEXT: andi a1, a2, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: slli a4, a2, 61 +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: slli a0, a2, 60 ; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 60 -; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: slli a4, a2, 59 +; RV64-NEXT: vslide1down.vx v12, v12, a1 +; RV64-NEXT: slli a1, a2, 59 ; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: vslide1down.vx v12, v12, a1 +; RV64-NEXT: vslide1down.vx v12, v12, a2 ; RV64-NEXT: vsetivli zero, 6, e8, mf2, ta, ma -; RV64-NEXT: vslidedown.vi v10, v10, 2 -; RV64-NEXT: vand.vi 
v10, v10, 1 -; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; RV64-NEXT: vle32.v v8, (a0), v0.t +; RV64-NEXT: vslidedown.vi v12, v12, 2 +; RV64-NEXT: vand.vi v12, v12, 1 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: vse32.v v8, (a3) ; RV64-NEXT: ret %va = load <6 x i32>, ptr %a @@ -218,30 +220,31 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV32: # %bb.0: ; RV32-NEXT: lbu a2, 0(a2) ; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: slli a1, a2, 30 -; RV32-NEXT: andi a4, a2, 1 +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vle32.v v10, (a1) +; RV32-NEXT: slli a0, a2, 30 +; RV32-NEXT: andi a1, a2, 1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV32-NEXT: vmv.v.x v10, a4 -; RV32-NEXT: slli a4, a2, 29 +; RV32-NEXT: vmv.v.x v12, a1 +; RV32-NEXT: slli a1, a2, 29 +; RV32-NEXT: srli a0, a0, 31 +; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: slli a0, a2, 28 ; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: slli a1, a2, 28 -; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: slli a4, a2, 27 +; RV32-NEXT: vslide1down.vx v12, v12, a1 +; RV32-NEXT: slli a1, a2, 27 ; RV32-NEXT: srli a2, a2, 5 +; RV32-NEXT: srli a0, a0, 31 ; RV32-NEXT: srli a1, a1, 31 -; RV32-NEXT: srli a4, a4, 31 -; RV32-NEXT: vslide1down.vx v10, v10, a1 -; RV32-NEXT: vslide1down.vx v10, v10, a4 -; RV32-NEXT: vslide1down.vx v10, v10, a2 +; RV32-NEXT: vslide1down.vx v12, v12, a0 +; RV32-NEXT: vslide1down.vx v12, v12, a1 +; RV32-NEXT: vslide1down.vx v12, v12, a2 ; RV32-NEXT: vsetivli zero, 6, e8, mf2, ta, ma -; RV32-NEXT: vslidedown.vi v10, v10, 2 -; RV32-NEXT: vand.vi v10, v10, 1 -; RV32-NEXT: vmsne.vi v0, v10, 0 -; RV32-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; RV32-NEXT: vle32.v v8, (a0), v0.t +; RV32-NEXT: vslidedown.vi v12, v12, 2 +; RV32-NEXT: vand.vi v12, v12, 1 +; RV32-NEXT: vmsne.vi v0, v12, 0 +; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV32-NEXT: vse32.v v8, (a3) ; RV32-NEXT: ret ; @@ -249,30 +252,31 @@ define void @vselect_vv_v6f32(ptr %a, ptr %b, ptr %cc, ptr %z) { ; RV64: # %bb.0: ; RV64-NEXT: lbu a2, 0(a2) ; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; RV64-NEXT: vle32.v v8, (a1) -; RV64-NEXT: slli a1, a2, 62 -; RV64-NEXT: andi a4, a2, 1 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vle32.v v10, (a1) +; RV64-NEXT: slli a0, a2, 62 +; RV64-NEXT: andi a1, a2, 1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; RV64-NEXT: vmv.v.x v10, a4 -; RV64-NEXT: slli a4, a2, 61 +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: slli a1, a2, 61 +; RV64-NEXT: srli a0, a0, 63 +; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: slli a0, a2, 60 ; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: slli a1, a2, 60 -; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: slli a4, a2, 59 +; RV64-NEXT: vslide1down.vx v12, v12, a1 +; RV64-NEXT: slli a1, a2, 59 ; RV64-NEXT: srli a2, a2, 5 +; RV64-NEXT: srli a0, a0, 63 ; RV64-NEXT: srli a1, a1, 63 -; RV64-NEXT: srli a4, a4, 63 -; RV64-NEXT: vslide1down.vx v10, v10, a1 -; RV64-NEXT: vslide1down.vx v10, v10, a4 -; RV64-NEXT: vslide1down.vx v10, v10, a2 +; RV64-NEXT: vslide1down.vx v12, v12, a0 +; RV64-NEXT: vslide1down.vx v12, v12, a1 +; RV64-NEXT: vslide1down.vx v12, v12, a2 ; RV64-NEXT: vsetivli zero, 6, e8, mf2, ta, ma -; 
RV64-NEXT: vslidedown.vi v10, v10, 2 -; RV64-NEXT: vand.vi v10, v10, 1 -; RV64-NEXT: vmsne.vi v0, v10, 0 -; RV64-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; RV64-NEXT: vle32.v v8, (a0), v0.t +; RV64-NEXT: vslidedown.vi v12, v12, 2 +; RV64-NEXT: vand.vi v12, v12, 1 +; RV64-NEXT: vmsne.vi v0, v12, 0 +; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: vse32.v v8, (a3) ; RV64-NEXT: ret %va = load <6 x float>, ptr %a @@ -634,9 +638,10 @@ define <2 x i1> @vselect_v2i1(<2 x i1> %a, <2 x i1> %b, <2 x i1> %cc) { ; CHECK-LABEL: vselect_v2i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select <2 x i1> %cc, <2 x i1> %a, <2 x i1> %b ret <2 x i1> %v @@ -646,9 +651,10 @@ define <4 x i1> @vselect_v4i1(<4 x i1> %a, <4 x i1> %b, <4 x i1> %cc) { ; CHECK-LABEL: vselect_v4i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select <4 x i1> %cc, <4 x i1> %a, <4 x i1> %b ret <4 x i1> %v @@ -658,9 +664,10 @@ define <8 x i1> @vselect_v8i1(<8 x i1> %a, <8 x i1> %b, <8 x i1> %cc) { ; CHECK-LABEL: vselect_v8i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select <8 x i1> %cc, <8 x i1> %a, <8 x i1> %b ret <8 x i1> %v @@ -670,9 +677,10 @@ define <16 x i1> @vselect_v16i1(<16 x i1> %a, <16 x i1> %b, <16 x i1> %cc) { ; CHECK-LABEL: vselect_v16i1: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select <16 x i1> %cc, <16 x i1> %a, <16 x i1> %b ret <16 x i1> %v @@ -683,9 +691,10 @@ define <32 x i1> @vselect_v32i1(<32 x i1> %a, <32 x i1> %b, <32 x i1> %cc) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select <32 x i1> %cc, <32 x i1> %a, <32 x i1> %b ret <32 x i1> %v @@ -696,9 +705,10 @@ define <64 x i1> @vselect_v64i1(<64 x i1> %a, <64 x i1> %b, <64 x i1> %cc) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vmandn.mm v8, v8, v9 -; CHECK-NEXT: vmand.mm v9, v0, v9 -; CHECK-NEXT: vmor.mm v0, v9, v8 +; CHECK-NEXT: vmand.mm v10, v0, v9 +; CHECK-NEXT: vmnot.m v9, v9 +; CHECK-NEXT: vmand.mm v8, v8, v9 +; CHECK-NEXT: vmor.mm v0, v10, v8 ; CHECK-NEXT: ret %v = select <64 x i1> %cc, <64 x i1> %a, <64 x i1> %b 
ret <64 x i1> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll index 227a428831b60..856974bf87d21 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vw-web-simplification.ll @@ -34,16 +34,18 @@ define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) { ; ; FOLDING-LABEL: vwmul_v2i16_multiple_users: ; FOLDING: # %bb.0: -; FOLDING-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; FOLDING-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; FOLDING-NEXT: vle8.v v8, (a0) ; FOLDING-NEXT: vle8.v v9, (a1) ; FOLDING-NEXT: vle8.v v10, (a2) -; FOLDING-NEXT: vwmul.vv v11, v8, v9 -; FOLDING-NEXT: vwadd.vv v9, v8, v10 -; FOLDING-NEXT: vwsub.vv v12, v8, v10 -; FOLDING-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; FOLDING-NEXT: vor.vv v8, v11, v9 -; FOLDING-NEXT: vor.vv v8, v8, v12 +; FOLDING-NEXT: vsext.vf2 v11, v8 +; FOLDING-NEXT: vsext.vf2 v8, v9 +; FOLDING-NEXT: vsext.vf2 v9, v10 +; FOLDING-NEXT: vmul.vv v8, v11, v8 +; FOLDING-NEXT: vadd.vv v10, v11, v9 +; FOLDING-NEXT: vsub.vv v9, v11, v9 +; FOLDING-NEXT: vor.vv v8, v8, v10 +; FOLDING-NEXT: vor.vv v8, v8, v9 ; FOLDING-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll index 7bac239cfffea..324bcb0995e8b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd-mask.ll @@ -5,13 +5,14 @@ define <8 x i64> @vwadd_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { ; CHECK-LABEL: vwadd_wv_mask_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv2r.v v16, v8 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv4r.v v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; CHECK-NEXT: vwadd.wv v8, v8, v16, v0.t +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -23,13 +24,14 @@ define <8 x i64> @vwadd_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { define <8 x i64> @vwaddu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { ; CHECK-LABEL: vwaddu_wv_mask_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv2r.v v16, v8 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv4r.v v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; CHECK-NEXT: vwaddu.wv v8, v8, v16, v0.t +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -41,13 +43,15 @@ define <8 x i64> @vwaddu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { define <8 x i64> @vwaddu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: vwaddu_vv_mask_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, 
e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmerge.vvm v14, v10, v8, v0 -; CHECK-NEXT: vwaddu.vv v8, v14, v12 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vzext.vf2 v16, v10 +; CHECK-NEXT: vadd.vv v8, v12, v16 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -60,13 +64,14 @@ define <8 x i64> @vwaddu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) { define <8 x i64> @vwadd_wv_mask_v8i32_commutative(<8 x i32> %x, <8 x i64> %y) { ; CHECK-LABEL: vwadd_wv_mask_v8i32_commutative: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv2r.v v16, v8 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv4r.v v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; CHECK-NEXT: vwadd.wv v8, v8, v16, v0.t +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -83,7 +88,9 @@ define <8 x i64> @vwadd_wv_mask_v8i32_nonzero(<8 x i32> %x, <8 x i64> %y) { ; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmv.v.i v10, 1 ; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 -; CHECK-NEXT: vwadd.wv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll index b39fff64b1090..6a0f881250a1d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwadd_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,10 +23,12 @@ define <2 x i16> @vwadd_v2i16(ptr %x, ptr %y) { define <4 x i16> @vwadd_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -37,10 +41,12 @@ define <4 x i16> @vwadd_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwadd_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: 
vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -53,10 +59,12 @@ define <2 x i32> @vwadd_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwadd_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -69,10 +77,12 @@ define <8 x i16> @vwadd_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwadd_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -85,10 +95,12 @@ define <4 x i32> @vwadd_v4i32(ptr %x, ptr %y) { define <2 x i64> @vwadd_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwadd.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -101,10 +113,12 @@ define <2 x i64> @vwadd_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwadd_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -117,10 +131,12 @@ define <16 x i16> @vwadd_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwadd_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -133,10 +149,12 @@ define <8 x i32> @vwadd_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwadd_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; 
CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -150,10 +168,12 @@ define <32 x i16> @vwadd_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwadd.vv v8, v12, v14 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -166,10 +186,12 @@ define <32 x i16> @vwadd_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwadd_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwadd.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -182,10 +204,12 @@ define <16 x i32> @vwadd_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwadd_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwadd.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -199,10 +223,12 @@ define <64 x i16> @vwadd_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwadd.vv v8, v16, v20 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -216,10 +242,12 @@ define <32 x i32> @vwadd_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwadd.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -232,10 +260,12 @@ define <32 x i32> @vwadd_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwadd_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwadd.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, 
v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -255,18 +285,26 @@ define <128 x i16> @vwadd_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -290,18 +328,26 @@ define <64 x i32> @vwadd_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwadd.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -325,16 +371,24 @@ define <32 x i64> @vwadd_v32i64(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, 
e32, m4, ta, ma -; CHECK-NEXT: vwadd.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwadd.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -351,12 +405,12 @@ define <32 x i64> @vwadd_v32i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwadd_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vsext.vf2 v11, v9 -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf4 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -369,11 +423,12 @@ define <2 x i32> @vwadd_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwadd_v4i32_v4i8_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vwadd.vv v8, v10, v9 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -386,11 +441,12 @@ define <4 x i32> @vwadd_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwadd_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsext.vf4 v11, v8 -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf8 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -404,8 +460,12 @@ define <2 x i16> @vwadd_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwadd_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -420,8 +480,12 @@ define <4 x i16> @vwadd_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwadd_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; 
CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -436,8 +500,12 @@ define <2 x i32> @vwadd_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwadd_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -452,8 +520,12 @@ define <8 x i16> @vwadd_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwadd_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -468,8 +540,12 @@ define <4 x i32> @vwadd_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwadd_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -484,8 +560,12 @@ define <2 x i64> @vwadd_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwadd_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwadd.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -501,7 +581,11 @@ define <16 x i16> @vwadd_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwadd.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -517,7 +601,11 @@ define <8 x i32> @vwadd_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwadd.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -533,7 +621,11 @@ define <4 x i64> @vwadd_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwadd.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, 
e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -550,7 +642,11 @@ define <32 x i16> @vwadd_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwadd.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -566,7 +662,11 @@ define <16 x i32> @vwadd_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwadd.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -582,7 +682,11 @@ define <8 x i64> @vwadd_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwadd.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -599,7 +703,11 @@ define <64 x i16> @vwadd_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwadd.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -616,7 +724,11 @@ define <32 x i32> @vwadd_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwadd.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -632,7 +744,11 @@ define <16 x i64> @vwadd_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwadd.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 @@ -646,10 +762,11 @@ define <16 x i64> @vwadd_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwadd_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: lb a0, 0(a1) -; CHECK-NEXT: 
vwadd.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -664,12 +781,11 @@ define <8 x i16> @vwadd_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwadd_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vwadd.wv v8, v8, v9 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i16, ptr %y @@ -683,10 +799,11 @@ define <8 x i16> @vwadd_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwadd_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lb a0, 0(a1) -; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -701,10 +818,11 @@ define <4 x i32> @vwadd_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwadd_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lh a0, 0(a1) -; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y @@ -719,12 +837,11 @@ define <4 x i32> @vwadd_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vwadd.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i32, ptr %y @@ -738,10 +855,11 @@ define <4 x i32> @vwadd_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwadd_vx_v2i64_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lb a0, 0(a1) -; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -756,10 +874,11 @@ define <2 x i64> @vwadd_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwadd_vx_v2i64_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lh a0, 0(a1) -; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ 
-774,10 +893,11 @@ define <2 x i64> @vwadd_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwadd_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwadd_vx_v2i64_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lw a0, 0(a1) -; CHECK-NEXT: vwadd.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y @@ -795,24 +915,24 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a2, 0(a1) ; RV32-NEXT: lw a1, 4(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwadd.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwadd_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwadd.wv v8, v8, v9 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vadd.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i64, ptr %y @@ -826,12 +946,12 @@ define <2 x i64> @vwadd_vx_v2i64_i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwadd_v2i32_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i32_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vsext.vf2 v11, v9 -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf4 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -844,12 +964,12 @@ define <2 x i32> @vwadd_v2i32_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwadd_v2i64_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i64_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vsext.vf4 v10, v8 -; CHECK-NEXT: vsext.vf4 v11, v9 -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vsext.vf8 v10, v8 +; CHECK-NEXT: vsext.vf8 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -862,12 +982,12 @@ define <2 x i64> @vwadd_v2i64_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwadd_v2i64_of_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwadd_v2i64_of_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vsext.vf2 v11, v9 -; CHECK-NEXT: vwadd.vv v8, v10, v11 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf4 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -883,9 +1003,11 @@ 
define <4 x i32> @vwaddu_vv_disjoint_or_add(<4 x i8> %x.i8, <4 x i8> %y.i8) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vsll.vi v10, v10, 8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: vsll.vi v8, v10, 8 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf4 v8, v9 +; CHECK-NEXT: vor.vv v8, v10, v8 ; CHECK-NEXT: ret %x.i16 = zext <4 x i8> %x.i8 to <4 x i16> %x.shl = shl <4 x i16> %x.i16, splat (i16 8) @@ -899,8 +1021,9 @@ define <4 x i32> @vwaddu_vv_disjoint_or(<4 x i16> %x.i16, <4 x i16> %y.i16) { ; CHECK-LABEL: vwaddu_vv_disjoint_or: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vwaddu.vv v10, v8, v9 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vor.vv v9, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: ret %x.i32 = zext <4 x i16> %x.i16 to <4 x i32> %y.i32 = zext <4 x i16> %y.i16 to <4 x i32> @@ -912,8 +1035,9 @@ define <4 x i32> @vwadd_vv_disjoint_or(<4 x i16> %x.i16, <4 x i16> %y.i16) { ; CHECK-LABEL: vwadd_vv_disjoint_or: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vwadd.vv v10, v8, v9 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vor.vv v9, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v9 ; CHECK-NEXT: ret %x.i32 = sext <4 x i16> %x.i16 to <4 x i32> %y.i32 = sext <4 x i16> %y.i16 to <4 x i32> @@ -925,8 +1049,9 @@ define <4 x i32> @vwaddu_vx_disjoint_or(<4 x i16> %x.i16, i16 %y.i16) { ; CHECK-LABEL: vwaddu_vx_disjoint_or: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vwaddu.vx v9, v8, a0 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vor.vx v9, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: ret %x.i32 = zext <4 x i16> %x.i16 to <4 x i32> %y.head = insertelement <4 x i16> poison, i16 %y.i16, i32 0 @@ -940,8 +1065,9 @@ define <4 x i32> @vwadd_vx_disjoint_or(<4 x i16> %x.i16, i16 %y.i16) { ; CHECK-LABEL: vwadd_vx_disjoint_or: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vwadd.vx v9, v8, a0 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vor.vx v9, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v9 ; CHECK-NEXT: ret %x.i32 = sext <4 x i16> %x.i16 to <4 x i32> %y.head = insertelement <4 x i16> poison, i16 %y.i16, i32 0 @@ -954,8 +1080,9 @@ define <4 x i32> @vwadd_vx_disjoint_or(<4 x i16> %x.i16, i16 %y.i16) { define <4 x i32> @vwaddu_wv_disjoint_or(<4 x i32> %x.i32, <4 x i16> %y.i16) { ; CHECK-LABEL: vwaddu_wv_disjoint_or: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vwaddu.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret %y.i32 = zext <4 x i16> %y.i16 to <4 x i32> %or = or disjoint <4 x i32> %x.i32, %y.i32 @@ -965,8 +1092,9 @@ define <4 x i32> @vwaddu_wv_disjoint_or(<4 x i32> %x.i32, <4 x i16> %y.i16) { define <4 x i32> @vwadd_wv_disjoint_or(<4 x i32> %x.i32, <4 x i16> %y.i16) { ; CHECK-LABEL: vwadd_wv_disjoint_or: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vwadd.wv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v9 +; CHECK-NEXT: vor.vv v8, v8, v10 ; CHECK-NEXT: ret %y.i32 = sext 
<4 x i16> %y.i16 to <4 x i32> %or = or disjoint <4 x i32> %x.i32, %y.i32 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll index 1a716f688dd59..d5ebb9ba1a3ef 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwaddu_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,10 +23,12 @@ define <2 x i16> @vwaddu_v2i16(ptr %x, ptr %y) { define <4 x i16> @vwaddu_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -37,10 +41,12 @@ define <4 x i16> @vwaddu_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwaddu_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -53,10 +59,12 @@ define <2 x i32> @vwaddu_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwaddu_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -69,10 +77,12 @@ define <8 x i16> @vwaddu_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwaddu_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -85,10 +95,12 @@ define <4 x i32> @vwaddu_v4i32(ptr %x, ptr %y) { define <2 x i64> @vwaddu_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; 
CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwaddu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -101,10 +113,12 @@ define <2 x i64> @vwaddu_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwaddu_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -117,10 +131,12 @@ define <16 x i16> @vwaddu_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwaddu_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -133,10 +149,12 @@ define <8 x i32> @vwaddu_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwaddu_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -150,10 +168,12 @@ define <32 x i16> @vwaddu_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwaddu.vv v8, v12, v14 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -166,10 +186,12 @@ define <32 x i16> @vwaddu_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwaddu_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwaddu.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -182,10 +204,12 @@ define <16 x i32> @vwaddu_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwaddu_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, 
(a1) -; CHECK-NEXT: vwaddu.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -199,10 +223,12 @@ define <64 x i16> @vwaddu_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwaddu.vv v8, v16, v20 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -216,10 +242,12 @@ define <32 x i32> @vwaddu_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwaddu.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -232,10 +260,12 @@ define <32 x i32> @vwaddu_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwaddu_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwaddu.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -255,18 +285,26 @@ define <128 x i16> @vwaddu_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -290,18 +328,26 @@ define <64 x i32> @vwaddu_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) 
-; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -325,16 +371,24 @@ define <32 x i64> @vwaddu_v32i64(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwaddu.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwaddu.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v0 +; CHECK-NEXT: vadd.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -351,12 +405,14 @@ define <32 x i64> @vwaddu_v32i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwaddu_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -369,11 +425,12 @@ define <2 x i32> @vwaddu_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwaddu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: 
vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vwaddu.vv v8, v10, v9 +; CHECK-NEXT: vzext.vf4 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -386,11 +443,12 @@ define <4 x i32> @vwaddu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwaddu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vzext.vf4 v11, v8 -; CHECK-NEXT: vwaddu.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf8 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -404,8 +462,12 @@ define <2 x i16> @vwaddu_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwaddu_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -420,8 +482,12 @@ define <4 x i16> @vwaddu_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwaddu_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -436,8 +502,12 @@ define <2 x i32> @vwaddu_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwaddu_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -452,8 +522,12 @@ define <8 x i16> @vwaddu_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwaddu_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -468,8 +542,12 @@ define <4 x i32> @vwaddu_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwaddu_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; 
CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -484,8 +562,12 @@ define <2 x i64> @vwaddu_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwaddu_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwaddu.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -501,7 +583,11 @@ define <16 x i16> @vwaddu_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwaddu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -517,7 +603,11 @@ define <8 x i32> @vwaddu_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwaddu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -533,7 +623,11 @@ define <4 x i64> @vwaddu_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwaddu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -550,7 +644,11 @@ define <32 x i16> @vwaddu_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwaddu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -566,7 +664,11 @@ define <16 x i32> @vwaddu_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwaddu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -582,7 +684,11 @@ define <8 x i64> @vwaddu_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwaddu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vadd.vv v8, v8, 
v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -599,7 +705,11 @@ define <64 x i16> @vwaddu_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwaddu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -616,7 +726,11 @@ define <32 x i32> @vwaddu_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwaddu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -632,7 +746,11 @@ define <16 x i64> @vwaddu_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwaddu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 @@ -646,10 +764,11 @@ define <16 x i64> @vwaddu_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwaddu_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: lbu a0, 0(a1) -; CHECK-NEXT: vwaddu.vx v8, v9, a0 +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -664,12 +783,11 @@ define <8 x i16> @vwaddu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwaddu_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vwaddu.wv v8, v8, v9 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i16, ptr %y @@ -683,10 +801,11 @@ define <8 x i16> @vwaddu_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwaddu_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lbu a0, 0(a1) -; CHECK-NEXT: vwaddu.vx v8, v9, a0 +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -701,10 +820,11 @@ define <4 x i32> @vwaddu_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwaddu_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; 
CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lhu a0, 0(a1) -; CHECK-NEXT: vwaddu.vx v8, v9, a0 +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y @@ -719,12 +839,11 @@ define <4 x i32> @vwaddu_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwaddu_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vwaddu.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vadd.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i32, ptr %y @@ -740,22 +859,24 @@ define <2 x i64> @vwaddu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwaddu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwaddu_vx_v2i64_i8: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lbu a0, 0(a1) -; RV64-NEXT: vwaddu.vx v8, v9, a0 +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -772,22 +893,24 @@ define <2 x i64> @vwaddu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lhu a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwaddu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwaddu_vx_v2i64_i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lhu a0, 0(a1) -; RV64-NEXT: vwaddu.vx v8, v9, a0 +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ -804,22 +927,24 @@ define <2 x i64> @vwaddu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwaddu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; 
RV32-NEXT: ret ; ; RV64-LABEL: vwaddu_vx_v2i64_i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lwu a0, 0(a1) -; RV64-NEXT: vwaddu.vx v8, v9, a0 +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y @@ -837,24 +962,24 @@ define <2 x i64> @vwaddu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a2, 0(a1) ; RV32-NEXT: lw a1, 4(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwaddu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vadd.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwaddu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwaddu.wv v8, v8, v9 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vadd.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i64, ptr %y @@ -869,12 +994,9 @@ define <4 x i64> @crash(<4 x i16> %x, <4 x i16> %y) { ; CHECK-LABEL: crash: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v9 -; CHECK-NEXT: vmv1r.v v11, v8 -; CHECK-NEXT: vsext.vf4 v8, v11 -; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vzext.vf2 v11, v10 -; CHECK-NEXT: vwaddu.wv v8, v8, v11 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vzext.vf4 v12, v9 +; CHECK-NEXT: vadd.vv v8, v10, v12 ; CHECK-NEXT: ret %a = sext <4 x i16> %x to <4 x i64> %b = zext <4 x i16> %y to <4 x i64> @@ -885,12 +1007,14 @@ define <4 x i64> @crash(<4 x i16> %x, <4 x i16> %y) { define <2 x i32> @vwaddu_v2i32_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i32_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -903,12 +1027,14 @@ define <2 x i32> @vwaddu_v2i32_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwaddu_v2i64_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i64_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vzext.vf4 v8, v10 +; CHECK-NEXT: vzext.vf4 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -921,12 +1047,14 @@ define <2 x i64> @vwaddu_v2i64_of_v2i8(ptr %x, ptr %y) { define <2 x i64> 
@vwaddu_v2i64_of_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwaddu_v2i64_of_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vwaddu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll index 7791163b3d1f6..0182c7cb26f58 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmacc.ll @@ -5,10 +5,13 @@ define <2 x i16> @vwmacc_v2i16(ptr %x, ptr %y, <2 x i16> %z) { ; CHECK-LABEL: vwmacc_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -22,10 +25,13 @@ define <2 x i16> @vwmacc_v2i16(ptr %x, ptr %y, <2 x i16> %z) { define <4 x i16> @vwmacc_v4i16(ptr %x, ptr %y, <4 x i16> %z) { ; CHECK-LABEL: vwmacc_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -39,10 +45,13 @@ define <4 x i16> @vwmacc_v4i16(ptr %x, ptr %y, <4 x i16> %z) { define <2 x i32> @vwmacc_v2i32(ptr %x, ptr %y, <2 x i32> %z) { ; CHECK-LABEL: vwmacc_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -56,10 +65,13 @@ define <2 x i32> @vwmacc_v2i32(ptr %x, ptr %y, <2 x i32> %z) { define <8 x i16> @vwmacc_v8i16(ptr %x, ptr %y, <8 x i16> %z) { ; CHECK-LABEL: vwmacc_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -73,10 +85,13 @@ define <8 x i16> @vwmacc_v8i16(ptr %x, ptr %y, <8 x i16> %z) { define <4 x i32> @vwmacc_v4i32(ptr %x, ptr %y, <4 x i32> %z) { ; CHECK-LABEL: vwmacc_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; 
CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -90,10 +105,13 @@ define <4 x i32> @vwmacc_v4i32(ptr %x, ptr %y, <4 x i32> %z) { define <2 x i64> @vwmacc_v2i64(ptr %x, ptr %y, <2 x i64> %z) { ; CHECK-LABEL: vwmacc_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmacc.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -107,10 +125,13 @@ define <2 x i64> @vwmacc_v2i64(ptr %x, ptr %y, <2 x i64> %z) { define <16 x i16> @vwmacc_v16i16(ptr %x, ptr %y, <16 x i16> %z) { ; CHECK-LABEL: vwmacc_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmacc.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -124,10 +145,13 @@ define <16 x i16> @vwmacc_v16i16(ptr %x, ptr %y, <16 x i16> %z) { define <8 x i32> @vwmacc_v8i32(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: vwmacc_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmacc.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -141,10 +165,13 @@ define <8 x i32> @vwmacc_v8i32(ptr %x, ptr %y, <8 x i32> %z) { define <4 x i64> @vwmacc_v4i64(ptr %x, ptr %y, <4 x i64> %z) { ; CHECK-LABEL: vwmacc_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmacc.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -159,10 +186,13 @@ define <32 x i16> @vwmacc_v32i16(ptr %x, ptr %y, <32 x i16> %z) { ; CHECK-LABEL: vwmacc_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmacc.vv v8, v12, v14 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -176,10 +206,13 @@ define <32 x 
i16> @vwmacc_v32i16(ptr %x, ptr %y, <32 x i16> %z) { define <16 x i32> @vwmacc_v16i32(ptr %x, ptr %y, <16 x i32> %z) { ; CHECK-LABEL: vwmacc_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmacc.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -193,10 +226,13 @@ define <16 x i32> @vwmacc_v16i32(ptr %x, ptr %y, <16 x i32> %z) { define <8 x i64> @vwmacc_v8i64(ptr %x, ptr %y, <8 x i64> %z) { ; CHECK-LABEL: vwmacc_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmacc.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -211,10 +247,13 @@ define <64 x i16> @vwmacc_v64i16(ptr %x, ptr %y, <64 x i16> %z) { ; CHECK-LABEL: vwmacc_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmacc.vv v8, v16, v20 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vle8.v v4, (a1) +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -229,10 +268,13 @@ define <32 x i32> @vwmacc_v32i32(ptr %x, ptr %y, <32 x i32> %z) { ; CHECK-LABEL: vwmacc_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmacc.vv v8, v16, v20 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vle16.v v4, (a1) +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -246,10 +288,13 @@ define <32 x i32> @vwmacc_v32i32(ptr %x, ptr %y, <32 x i32> %z) { define <16 x i64> @vwmacc_v16i64(ptr %x, ptr %y, <16 x i64> %z) { ; CHECK-LABEL: vwmacc_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmacc.vv v8, v16, v20 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vle32.v v4, (a1) +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -265,7 +310,12 @@ define <2 x i16> @vwmacc_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; 
CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -282,7 +332,12 @@ define <4 x i16> @vwmacc_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -299,7 +354,12 @@ define <2 x i32> @vwmacc_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -316,7 +376,12 @@ define <8 x i16> @vwmacc_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -333,7 +398,12 @@ define <4 x i32> @vwmacc_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -350,7 +420,12 @@ define <2 x i64> @vwmacc_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -366,8 +441,13 @@ define <16 x i16> @vwmacc_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmacc_vx_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v10 +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ 
-383,8 +463,13 @@ define <8 x i32> @vwmacc_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmacc_vx_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v10 +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -400,8 +485,13 @@ define <4 x i64> @vwmacc_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v10 +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -418,8 +508,13 @@ define <32 x i16> @vwmacc_vx_v32i16(ptr %x, i8 %y, <32 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v12 +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -435,8 +530,13 @@ define <16 x i32> @vwmacc_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmacc_vx_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v12 +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -452,8 +552,13 @@ define <8 x i64> @vwmacc_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v12 +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -470,8 +575,13 @@ define <64 x i16> @vwmacc_vx_v64i16(ptr %x, i8 %y, <64 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v16 +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = 
load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -488,8 +598,13 @@ define <32 x i32> @vwmacc_vx_v32i32(ptr %x, i16 %y, <32 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v16 +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -505,8 +620,13 @@ define <16 x i64> @vwmacc_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmacc_vx_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmacc.vx v8, a1, v16 +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll index cf76c980c6de6..d284096062a0c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccsu.ll @@ -5,10 +5,13 @@ define <2 x i16> @vwmaccsu_v2i16(ptr %x, ptr %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -22,10 +25,13 @@ define <2 x i16> @vwmaccsu_v2i16(ptr %x, ptr %y, <2 x i16> %z) { define <4 x i16> @vwmaccsu_v4i16(ptr %x, ptr %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -39,10 +45,13 @@ define <4 x i16> @vwmaccsu_v4i16(ptr %x, ptr %y, <4 x i16> %z) { define <2 x i32> @vwmaccsu_v2i32(ptr %x, ptr %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -56,10 +65,13 @@ define <2 x i32> @vwmaccsu_v2i32(ptr %x, ptr %y, <2 x i32> %z) { define <8 x i16> @vwmaccsu_v8i16(ptr %x, ptr %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 
8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -73,10 +85,13 @@ define <8 x i16> @vwmaccsu_v8i16(ptr %x, ptr %y, <8 x i16> %z) { define <4 x i32> @vwmaccsu_v4i32(ptr %x, ptr %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -90,10 +105,13 @@ define <4 x i32> @vwmaccsu_v4i32(ptr %x, ptr %y, <4 x i32> %z) { define <2 x i64> @vwmaccsu_v2i64(ptr %x, ptr %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v9, v10 +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -107,10 +125,13 @@ define <2 x i64> @vwmaccsu_v2i64(ptr %x, ptr %y, <2 x i64> %z) { define <16 x i16> @vwmaccsu_v16i16(ptr %x, ptr %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -124,10 +145,13 @@ define <16 x i16> @vwmaccsu_v16i16(ptr %x, ptr %y, <16 x i16> %z) { define <8 x i32> @vwmaccsu_v8i32(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -141,10 +165,13 @@ define <8 x i32> @vwmaccsu_v8i32(ptr %x, ptr %y, <8 x i32> %z) { define <4 x i64> @vwmaccsu_v4i64(ptr %x, ptr %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; 
CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -159,10 +186,13 @@ define <32 x i16> @vwmaccsu_v32i16(ptr %x, ptr %y, <32 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v12, v14 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -176,10 +206,13 @@ define <32 x i16> @vwmaccsu_v32i16(ptr %x, ptr %y, <32 x i16> %z) { define <16 x i32> @vwmaccsu_v16i32(ptr %x, ptr %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -193,10 +226,13 @@ define <16 x i32> @vwmaccsu_v16i32(ptr %x, ptr %y, <16 x i32> %z) { define <8 x i64> @vwmaccsu_v8i64(ptr %x, ptr %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -211,10 +247,13 @@ define <64 x i16> @vwmaccsu_v64i16(ptr %x, ptr %y, <64 x i16> %z) { ; CHECK-LABEL: vwmaccsu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v16, v20 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vle8.v v4, (a1) +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -229,10 +268,13 @@ define <32 x i32> @vwmaccsu_v32i32(ptr %x, ptr %y, <32 x i32> %z) { ; CHECK-LABEL: vwmaccsu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v16, v20 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vle16.v v4, (a1) +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -246,10 +288,13 @@ define <32 x i32> @vwmaccsu_v32i32(ptr %x, ptr %y, <32 x i32> %z) { define <16 x i64> @vwmaccsu_v16i64(ptr %x, 
ptr %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccsu_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmaccsu.vv v8, v16, v20 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vle32.v v4, (a1) +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -265,7 +310,12 @@ define <2 x i16> @vwmaccsu_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -282,7 +332,12 @@ define <4 x i16> @vwmaccsu_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -299,7 +354,12 @@ define <2 x i32> @vwmaccsu_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -316,7 +376,12 @@ define <8 x i16> @vwmaccsu_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -333,7 +398,12 @@ define <4 x i32> @vwmaccsu_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -350,7 +420,12 @@ define <2 x i64> @vwmaccsu_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; 
CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -366,8 +441,13 @@ define <16 x i16> @vwmaccsu_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccsu_vx_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v10 +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -383,8 +463,13 @@ define <8 x i32> @vwmaccsu_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccsu_vx_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v10 +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -400,8 +485,13 @@ define <4 x i64> @vwmaccsu_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v10 +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -418,8 +508,13 @@ define <32 x i16> @vwmaccsu_vx_v32i16(ptr %x, i8 %y, <32 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v12 +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -435,8 +530,13 @@ define <16 x i32> @vwmaccsu_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccsu_vx_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v12 +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -452,8 +552,13 @@ define <8 x i64> @vwmaccsu_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v12 +; 
CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -470,8 +575,13 @@ define <64 x i16> @vwmaccsu_vx_v64i16(ptr %x, i8 %y, <64 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v16 +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -488,8 +598,13 @@ define <32 x i32> @vwmaccsu_vx_v32i32(ptr %x, i16 %y, <32 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v16 +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -505,8 +620,13 @@ define <16 x i64> @vwmaccsu_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccsu_vx_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmaccsu.vx v8, a1, v16 +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll index 81137e6dc768b..ef00892da05f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccu.ll @@ -5,10 +5,13 @@ define <2 x i16> @vwmaccu_v2i16(ptr %x, ptr %y, <2 x i16> %z) { ; CHECK-LABEL: vwmaccu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -22,10 +25,13 @@ define <2 x i16> @vwmaccu_v2i16(ptr %x, ptr %y, <2 x i16> %z) { define <4 x i16> @vwmaccu_v4i16(ptr %x, ptr %y, <4 x i16> %z) { ; CHECK-LABEL: vwmaccu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, 
ptr %x %b = load <4 x i8>, ptr %y @@ -39,10 +45,13 @@ define <4 x i16> @vwmaccu_v4i16(ptr %x, ptr %y, <4 x i16> %z) { define <2 x i32> @vwmaccu_v2i32(ptr %x, ptr %y, <2 x i32> %z) { ; CHECK-LABEL: vwmaccu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -56,10 +65,13 @@ define <2 x i32> @vwmaccu_v2i32(ptr %x, ptr %y, <2 x i32> %z) { define <8 x i16> @vwmaccu_v8i16(ptr %x, ptr %y, <8 x i16> %z) { ; CHECK-LABEL: vwmaccu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) ; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -73,10 +85,13 @@ define <8 x i16> @vwmaccu_v8i16(ptr %x, ptr %y, <8 x i16> %z) { define <4 x i32> @vwmaccu_v4i32(ptr %x, ptr %y, <4 x i32> %z) { ; CHECK-LABEL: vwmaccu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) ; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -90,10 +105,13 @@ define <4 x i32> @vwmaccu_v4i32(ptr %x, ptr %y, <4 x i32> %z) { define <2 x i64> @vwmaccu_v2i64(ptr %x, ptr %y, <2 x i64> %z) { ; CHECK-LABEL: vwmaccu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) ; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -107,10 +125,13 @@ define <2 x i64> @vwmaccu_v2i64(ptr %x, ptr %y, <2 x i64> %z) { define <16 x i16> @vwmaccu_v16i16(ptr %x, ptr %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v14, (a1) +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -124,10 +145,13 @@ define <16 x i16> @vwmaccu_v16i16(ptr %x, ptr %y, <16 x i16> %z) { define <8 x i32> @vwmaccu_v8i32(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: 
vle16.v v12, (a0) +; CHECK-NEXT: vle16.v v14, (a1) +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -141,10 +165,13 @@ define <8 x i32> @vwmaccu_v8i32(ptr %x, ptr %y, <8 x i32> %z) { define <4 x i64> @vwmaccu_v4i64(ptr %x, ptr %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v10, v11 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vle32.v v14, (a1) +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -159,10 +186,13 @@ define <32 x i16> @vwmaccu_v32i16(ptr %x, ptr %y, <32 x i16> %z) { ; CHECK-LABEL: vwmaccu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v12, v14 +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vle8.v v20, (a1) +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -176,10 +206,13 @@ define <32 x i16> @vwmaccu_v32i16(ptr %x, ptr %y, <32 x i16> %z) { define <16 x i32> @vwmaccu_v16i32(ptr %x, ptr %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vle16.v v20, (a1) +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -193,10 +226,13 @@ define <16 x i32> @vwmaccu_v16i32(ptr %x, ptr %y, <16 x i32> %z) { define <8 x i64> @vwmaccu_v8i64(ptr %x, ptr %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v20, (a1) +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -211,10 +247,13 @@ define <64 x i16> @vwmaccu_v64i16(ptr %x, ptr %y, <64 x i16> %z) { ; CHECK-LABEL: vwmaccu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v16, v20 +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vle8.v v4, (a1) +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load 
<64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -229,10 +268,13 @@ define <32 x i32> @vwmaccu_v32i32(ptr %x, ptr %y, <32 x i32> %z) { ; CHECK-LABEL: vwmaccu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v16, v20 +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vle16.v v4, (a1) +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -246,10 +288,13 @@ define <32 x i32> @vwmaccu_v32i32(ptr %x, ptr %y, <32 x i32> %z) { define <16 x i64> @vwmaccu_v16i64(ptr %x, ptr %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccu_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmaccu.vv v8, v16, v20 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vle32.v v4, (a1) +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -265,7 +310,12 @@ define <2 x i16> @vwmaccu_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -282,7 +332,12 @@ define <4 x i16> @vwmaccu_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -299,7 +354,12 @@ define <2 x i32> @vwmaccu_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -316,7 +376,12 @@ define <8 x i16> @vwmaccu_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -333,7 +398,12 @@ define <4 x i32> 
@vwmaccu_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -350,7 +420,12 @@ define <2 x i64> @vwmaccu_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -366,8 +441,13 @@ define <16 x i16> @vwmaccu_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccu_vx_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v10 +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -383,8 +463,13 @@ define <8 x i32> @vwmaccu_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccu_vx_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v10 +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -400,8 +485,13 @@ define <4 x i64> @vwmaccu_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v10 +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -418,8 +508,13 @@ define <32 x i16> @vwmaccu_vx_v32i16(ptr %x, i8 %y, <32 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v12 +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -435,8 +530,13 @@ define <16 x i32> @vwmaccu_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) 
{ ; CHECK-LABEL: vwmaccu_vx_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v12 +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -452,8 +552,13 @@ define <8 x i64> @vwmaccu_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v12 +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -470,8 +575,13 @@ define <64 x i16> @vwmaccu_vx_v64i16(ptr %x, i8 %y, <64 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v16 +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -488,8 +598,13 @@ define <32 x i32> @vwmaccu_vx_v32i32(ptr %x, i16 %y, <32 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v16 +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -505,8 +620,13 @@ define <16 x i64> @vwmaccu_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccu_vx_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmaccu.vx v8, a1, v16 +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll index d4b5fe98a6995..96766bc84c9a6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmaccus.ll @@ -7,7 +7,12 @@ define <2 x i16> @vwmaccus_vx_v2i16(ptr %x, i8 %y, <2 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: 
vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -24,7 +29,12 @@ define <4 x i16> @vwmaccus_vx_v4i16(ptr %x, i8 %y, <4 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -41,7 +51,12 @@ define <2 x i32> @vwmaccus_vx_v2i32(ptr %x, i16 %y, <2 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -58,7 +73,12 @@ define <8 x i16> @vwmaccus_vx_v8i16(ptr %x, i8 %y, <8 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -75,7 +95,12 @@ define <4 x i32> @vwmaccus_vx_v4i32(ptr %x, i16 %y, <4 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -92,7 +117,12 @@ define <2 x i64> @vwmaccus_vx_v2i64(ptr %x, i32 %y, <2 x i64> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v9 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v11, v9 +; CHECK-NEXT: vzext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -108,8 +138,13 @@ define <16 x i16> @vwmaccus_vx_v16i16(ptr %x, i8 %y, <16 x i16> %z) { ; CHECK-LABEL: vwmaccus_vx_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v10 +; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -125,8 +160,13 @@ define <8 x i32> 
@vwmaccus_vx_v8i32(ptr %x, i16 %y, <8 x i32> %z) { ; CHECK-LABEL: vwmaccus_vx_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v10 +; CHECK-NEXT: vle16.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -142,8 +182,13 @@ define <4 x i64> @vwmaccus_vx_v4i64(ptr %x, i32 %y, <4 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v4i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v10 +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vmv.v.x v14, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vzext.vf2 v12, v14 +; CHECK-NEXT: vmul.vv v10, v10, v12 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -160,8 +205,13 @@ define <32 x i16> @vwmaccus_vx_v32i16(ptr %x, i8 %y, <32 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v12 +; CHECK-NEXT: vle8.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -177,8 +227,13 @@ define <16 x i32> @vwmaccus_vx_v16i32(ptr %x, i16 %y, <16 x i32> %z) { ; CHECK-LABEL: vwmaccus_vx_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v12 +; CHECK-NEXT: vle16.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -194,8 +249,13 @@ define <8 x i64> @vwmaccus_vx_v8i64(ptr %x, i32 %y, <8 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v12 +; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vmv.v.x v20, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vzext.vf2 v16, v20 +; CHECK-NEXT: vmul.vv v12, v12, v16 +; CHECK-NEXT: vadd.vv v8, v12, v8 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -212,8 +272,13 @@ define <64 x i16> @vwmaccus_vx_v64i16(ptr %x, i8 %y, <64 x i16> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v16 +; CHECK-NEXT: vle8.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load 
<64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -230,8 +295,13 @@ define <32 x i32> @vwmaccus_vx_v32i32(ptr %x, i16 %y, <32 x i32> %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v16 +; CHECK-NEXT: vle16.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -247,8 +317,13 @@ define <16 x i64> @vwmaccus_vx_v16i64(ptr %x, i32 %y, <16 x i64> %z) { ; CHECK-LABEL: vwmaccus_vx_v16i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmaccus.vx v8, a1, v16 +; CHECK-NEXT: vle32.v v24, (a0) +; CHECK-NEXT: vmv.v.x v4, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vzext.vf2 v24, v4 +; CHECK-NEXT: vmul.vv v16, v16, v24 +; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll index 94c3138fd330b..4f7d2982f2066 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwmul_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmul.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,14 +23,16 @@ define <2 x i16> @vwmul_v2i16(ptr %x, ptr %y) { define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: vwmul_v2i16_multiple_users: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) ; CHECK-NEXT: vle8.v v10, (a2) -; CHECK-NEXT: vwmul.vv v11, v8, v9 -; CHECK-NEXT: vwmul.vv v9, v8, v10 -; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-NEXT: vor.vv v8, v11, v9 +; CHECK-NEXT: vsext.vf2 v11, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsext.vf2 v9, v10 +; CHECK-NEXT: vmul.vv v8, v11, v8 +; CHECK-NEXT: vmul.vv v9, v11, v9 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -45,10 +49,12 @@ define <2 x i16> @vwmul_v2i16_multiple_users(ptr %x, ptr %y, ptr %z) { define <4 x i16> @vwmul_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmul.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ 
-61,10 +67,12 @@ define <4 x i16> @vwmul_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwmul_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmul.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -77,10 +85,12 @@ define <2 x i32> @vwmul_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwmul_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmul.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -93,10 +103,12 @@ define <8 x i16> @vwmul_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwmul_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmul.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -109,10 +121,12 @@ define <4 x i32> @vwmul_v4i32(ptr %x, ptr %y) { define <2 x i64> @vwmul_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmul.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -125,10 +139,12 @@ define <2 x i64> @vwmul_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwmul_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmul.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -141,10 +157,12 @@ define <16 x i16> @vwmul_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwmul_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmul.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -157,10 
+175,12 @@ define <8 x i32> @vwmul_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwmul_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmul.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -174,10 +194,12 @@ define <32 x i16> @vwmul_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmul.vv v8, v12, v14 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -190,10 +212,12 @@ define <32 x i16> @vwmul_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwmul_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmul.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -206,10 +230,12 @@ define <16 x i32> @vwmul_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwmul_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmul.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -223,10 +249,12 @@ define <64 x i16> @vwmul_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmul.vv v8, v16, v20 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -240,10 +268,12 @@ define <32 x i32> @vwmul_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmul.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -256,10 +286,12 @@ define <32 x i32> @vwmul_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwmul_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmul.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -281,18 +313,26 @@ define <128 x i16> @vwmul_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -320,18 +360,26 @@ define <64 x i32> @vwmul_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwmul.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -359,16 +407,24 @@ define <32 x i64> @vwmul_v32i64(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 
+; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwmul.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmul.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -387,12 +443,12 @@ define <32 x i64> @vwmul_v32i64(ptr %x, ptr %y) { define <2 x i32> @vwmul_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vsext.vf2 v11, v9 -; CHECK-NEXT: vwmul.vv v8, v10, v11 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf4 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -405,11 +461,12 @@ define <2 x i32> @vwmul_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwmul_v4i32_v4i8_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vwmul.vv v8, v10, v9 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -422,11 +479,12 @@ define <4 x i32> @vwmul_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwmul_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsext.vf4 v11, v8 -; CHECK-NEXT: vwmul.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf8 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -440,8 +498,12 @@ define <2 x i16> @vwmul_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmul_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -456,8 +518,12 @@ define <4 x i16> @vwmul_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmul_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: 
vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -472,8 +538,12 @@ define <2 x i32> @vwmul_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwmul_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -488,8 +558,12 @@ define <8 x i16> @vwmul_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmul_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -504,8 +578,12 @@ define <4 x i32> @vwmul_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwmul_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -520,8 +598,12 @@ define <2 x i64> @vwmul_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwmul_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmul.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -537,7 +619,11 @@ define <16 x i16> @vwmul_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmul.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -553,7 +639,11 @@ define <8 x i32> @vwmul_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmul.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 
%y, i32 0 @@ -569,7 +659,11 @@ define <4 x i64> @vwmul_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmul.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -586,7 +680,11 @@ define <32 x i16> @vwmul_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmul.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -602,7 +700,11 @@ define <16 x i32> @vwmul_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmul.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -618,7 +720,11 @@ define <8 x i64> @vwmul_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmul.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -635,7 +741,11 @@ define <64 x i16> @vwmul_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmul.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -652,7 +762,11 @@ define <32 x i32> @vwmul_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmul.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -668,7 +782,11 @@ define <16 x i64> @vwmul_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmul.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 @@ -682,10 +800,11 @@ define <16 x i64> @vwmul_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> 
@vwmul_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: lb a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -718,10 +837,11 @@ define <8 x i16> @vwmul_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwmul_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lb a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -736,10 +856,11 @@ define <4 x i32> @vwmul_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwmul_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lh a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y @@ -772,10 +893,11 @@ define <4 x i32> @vwmul_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwmul_vx_v2i64_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_vx_v2i64_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lb a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -790,10 +912,11 @@ define <2 x i64> @vwmul_vx_v2i64_i8(ptr %x, ptr %y) { define <2 x i64> @vwmul_vx_v2i64_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_vx_v2i64_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lh a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ -808,10 +931,11 @@ define <2 x i64> @vwmul_vx_v2i64_i16(ptr %x, ptr %y) { define <2 x i64> @vwmul_vx_v2i64_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmul_vx_v2i64_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: lw a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll index 8ebd93e9dc637..6f76d5f920009 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwmulsu_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v2i16: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,10 +23,12 @@ define <2 x i16> @vwmulsu_v2i16(ptr %x, ptr %y) { define <2 x i16> @vwmulsu_v2i16_swap(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v2i16_swap: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -37,10 +41,12 @@ define <2 x i16> @vwmulsu_v2i16_swap(ptr %x, ptr %y) { define <4 x i16> @vwmulsu_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -53,10 +59,12 @@ define <4 x i16> @vwmulsu_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwmulsu_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -69,10 +77,12 @@ define <2 x i32> @vwmulsu_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwmulsu_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -85,10 +95,12 @@ define <8 x i16> @vwmulsu_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwmulsu_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -101,10 +113,12 @@ define <4 x i32> @vwmulsu_v4i32(ptr %x, ptr %y) { define <2 x 
i64> @vwmulsu_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -117,10 +131,12 @@ define <2 x i64> @vwmulsu_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwmulsu_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v11, v10 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -133,10 +149,12 @@ define <16 x i16> @vwmulsu_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwmulsu_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v11, v10 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -149,10 +167,12 @@ define <8 x i32> @vwmulsu_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwmulsu_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v11, v10 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -166,10 +186,12 @@ define <32 x i16> @vwmulsu_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v14, v12 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -182,10 +204,12 @@ define <32 x i16> @vwmulsu_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwmulsu_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v14, v12 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -198,10 +222,12 @@ define <16 x i32> @vwmulsu_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwmulsu_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: 
vwmulsu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v14, v12 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -215,10 +241,12 @@ define <64 x i16> @vwmulsu_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v20, v16 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -232,10 +260,12 @@ define <32 x i32> @vwmulsu_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v20, v16 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -248,10 +278,12 @@ define <32 x i32> @vwmulsu_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwmulsu_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmulsu.vv v8, v20, v16 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -273,18 +305,26 @@ define <128 x i16> @vwmulsu_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add 
sp, sp, a0 @@ -312,18 +352,26 @@ define <64 x i32> @vwmulsu_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwmulsu.vv v8, v24, v16 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -351,16 +399,24 @@ define <32 x i64> @vwmulsu_v32i64(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwmulsu.vv v8, v24, v16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulsu.vv v16, v0, v24 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -379,12 +435,12 @@ define <32 x i64> @vwmulsu_v32i64(ptr %x, ptr %y) { define <2 x i32> @vwmulsu_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vsext.vf2 v11, v9 -; CHECK-NEXT: vwmulsu.vv v8, v11, v10 +; CHECK-NEXT: vzext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf4 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -397,11 +453,12 @@ define <2 x i32> @vwmulsu_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwmulsu_v4i32_v4i8_v4i16(ptr %x, ptr 
%y) { ; CHECK-LABEL: vwmulsu_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vwmulsu.vv v8, v9, v10 +; CHECK-NEXT: vzext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -414,11 +471,12 @@ define <4 x i32> @vwmulsu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwmulsu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsext.vf4 v11, v8 -; CHECK-NEXT: vwmulsu.vv v8, v11, v10 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf8 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -432,8 +490,12 @@ define <2 x i16> @vwmulsu_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulsu_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -448,9 +510,12 @@ define <2 x i16> @vwmulsu_vx_v2i16_swap(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulsu_vx_v2i16_swap: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -465,8 +530,12 @@ define <4 x i16> @vwmulsu_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulsu_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -481,8 +550,12 @@ define <2 x i32> @vwmulsu_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwmulsu_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -497,8 +570,12 @@ define <8 x i16> @vwmulsu_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; 
CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -513,8 +590,12 @@ define <4 x i32> @vwmulsu_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwmulsu_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -529,8 +610,12 @@ define <2 x i64> @vwmulsu_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwmulsu_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -546,7 +631,11 @@ define <16 x i16> @vwmulsu_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -562,7 +651,11 @@ define <8 x i32> @vwmulsu_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -578,7 +671,11 @@ define <4 x i64> @vwmulsu_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -595,7 +692,11 @@ define <32 x i16> @vwmulsu_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -611,7 +712,11 @@ define <16 x i32> @vwmulsu_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, 
ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -627,7 +732,11 @@ define <8 x i64> @vwmulsu_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -644,7 +753,11 @@ define <64 x i16> @vwmulsu_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -661,7 +774,11 @@ define <32 x i32> @vwmulsu_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -677,7 +794,11 @@ define <16 x i64> @vwmulsu_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 @@ -691,10 +812,11 @@ define <16 x i64> @vwmulsu_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwmulsu_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: lbu a0, 0(a1) -; CHECK-NEXT: vwmulsu.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -709,11 +831,11 @@ define <8 x i16> @vwmulsu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwmulsu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -728,10 +850,11 @@ define <8 x i16> 
@vwmulsu_vx_v8i16_i8_swap(ptr %x, ptr %y) { define <4 x i32> @vwmulsu_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lbu a0, 0(a1) -; CHECK-NEXT: vwmul.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -746,10 +869,11 @@ define <4 x i32> @vwmulsu_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwmulsu_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulsu_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lhu a0, 0(a1) -; CHECK-NEXT: vwmulsu.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y @@ -781,10 +905,11 @@ define <2 x i64> @vwmulsu_vx_v2i64_i8(ptr %x, ptr %y) { ; ; RV64-LABEL: vwmulsu_vx_v2i64_i8: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lbu a0, 0(a1) -; RV64-NEXT: vwmul.vx v8, v9, a0 +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmul.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -816,10 +941,11 @@ define <2 x i64> @vwmulsu_vx_v2i64_i16(ptr %x, ptr %y) { ; ; RV64-LABEL: vwmulsu_vx_v2i64_i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lhu a0, 0(a1) -; RV64-NEXT: vwmul.vx v8, v9, a0 +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmul.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ -851,10 +977,11 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) { ; ; RV64-LABEL: vwmulsu_vx_v2i64_i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: lwu a0, 0(a1) -; RV64-NEXT: vwmulsu.vx v8, v9, a0 +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmul.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y @@ -869,9 +996,11 @@ define <2 x i64> @vwmulsu_vx_v2i64_i32(ptr %x, ptr %y) { define <8 x i16> @vwmulsu_vx_v8i16_i8_and(ptr %x, i16 %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: zext.b a0, a1 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = and i16 %y, 255 @@ -885,10 +1014,11 @@ define <8 x i16> @vwmulsu_vx_v8i16_i8_and(ptr %x, i16 %y) { define <8 x i16> @vwmulsu_vx_v8i16_i8_and1(ptr %x, i16 %y) { ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and1: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: andi a0, a1, 254 -; CHECK-NEXT: vwmulsu.vx v8, v9, a0 +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: 
vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = and i16 %y, 254 @@ -900,12 +1030,25 @@ define <8 x i16> @vwmulsu_vx_v8i16_i8_and1(ptr %x, i16 %y) { } define <4 x i32> @vwmulsu_vx_v4i32_i16_and(ptr %x, i32 %y) { -; CHECK-LABEL: vwmulsu_vx_v4i32_i16_and: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v4i32_i16_and: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: vsext.vf2 v9, v8 +; RV32-NEXT: vmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v4i32_i16_and: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: slli a1, a1, 48 +; RV64-NEXT: srli a1, a1, 48 +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmul.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = and i32 %y, 65535 %c = insertelement <4 x i32> poison, i32 %b, i32 0 @@ -916,12 +1059,25 @@ define <4 x i32> @vwmulsu_vx_v4i32_i16_and(ptr %x, i32 %y) { } define <4 x i32> @vwmulsu_vx_v4i32_i16_zext(ptr %x, i16 %y) { -; CHECK-LABEL: vwmulsu_vx_v4i32_i16_zext: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulsu.vx v8, v9, a1 -; CHECK-NEXT: ret +; RV32-LABEL: vwmulsu_vx_v4i32_i16_zext: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vle16.v v8, (a0) +; RV32-NEXT: slli a1, a1, 16 +; RV32-NEXT: srli a1, a1, 16 +; RV32-NEXT: vsext.vf2 v9, v8 +; RV32-NEXT: vmul.vx v8, v9, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vwmulsu_vx_v4i32_i16_zext: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vle16.v v8, (a0) +; RV64-NEXT: slli a1, a1, 48 +; RV64-NEXT: srli a1, a1, 48 +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vmul.vx v8, v9, a1 +; RV64-NEXT: ret %a = load <4 x i16>, ptr %x %b = zext i16 %y to i32 %c = insertelement <4 x i32> poison, i32 %b, i32 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index 90e9ffdcb320a..6f1c26e332806 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwmulu_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,10 +23,12 @@ define <2 x i16> @vwmulu_v2i16(ptr %x, ptr %y) { define <4 x i16> @vwmulu_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -37,10 +41,12 @@ 
define <4 x i16> @vwmulu_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwmulu_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmulu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -53,10 +59,12 @@ define <2 x i32> @vwmulu_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwmulu_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwmulu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -69,10 +77,12 @@ define <8 x i16> @vwmulu_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwmulu_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwmulu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -85,10 +95,12 @@ define <4 x i32> @vwmulu_v4i32(ptr %x, ptr %y) { define <2 x i64> @vwmulu_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwmulu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -101,10 +113,12 @@ define <2 x i64> @vwmulu_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwmulu_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwmulu.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -117,10 +131,12 @@ define <16 x i16> @vwmulu_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwmulu_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwmulu.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ 
-133,10 +149,12 @@ define <8 x i32> @vwmulu_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwmulu_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwmulu.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -150,10 +168,12 @@ define <32 x i16> @vwmulu_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwmulu.vv v8, v12, v14 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -166,10 +186,12 @@ define <32 x i16> @vwmulu_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwmulu_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwmulu.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -182,10 +204,12 @@ define <16 x i32> @vwmulu_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwmulu_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwmulu.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -199,10 +223,12 @@ define <64 x i16> @vwmulu_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwmulu.vv v8, v16, v20 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -216,10 +242,12 @@ define <32 x i32> @vwmulu_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwmulu.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -232,10 +260,12 @@ define <32 x i32> @vwmulu_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwmulu_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v16i64: ; 
CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwmulu.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -257,18 +287,26 @@ define <128 x i16> @vwmulu_v128i16(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -296,18 +334,26 @@ define <64 x i32> @vwmulu_v64i32(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwmulu.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -335,16 +381,24 @@ define <32 x i64> @vwmulu_v32i64(ptr %x, ptr %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; 
CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwmulu.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwmulu.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vmul.vv v8, v8, v0 +; CHECK-NEXT: vmul.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -363,12 +417,14 @@ define <32 x i64> @vwmulu_v32i64(ptr %x, ptr %y) { define <2 x i32> @vwmulu_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwmulu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -381,11 +437,12 @@ define <2 x i32> @vwmulu_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwmulu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vwmulu.vv v8, v10, v9 +; CHECK-NEXT: vzext.vf4 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -398,11 +455,12 @@ define <4 x i32> @vwmulu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwmulu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vzext.vf4 v11, v8 -; CHECK-NEXT: vwmulu.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf8 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -416,8 +474,12 @@ define <2 x i16> @vwmulu_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulu_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -432,8 +494,12 @@ define <4 x i16> 
@vwmulu_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulu_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -448,8 +514,12 @@ define <2 x i32> @vwmulu_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwmulu_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -464,8 +534,12 @@ define <8 x i16> @vwmulu_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwmulu_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwmulu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -480,8 +554,12 @@ define <4 x i32> @vwmulu_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwmulu_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwmulu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -496,8 +574,12 @@ define <2 x i64> @vwmulu_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwmulu_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwmulu.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vmul.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -513,7 +595,11 @@ define <16 x i16> @vwmulu_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwmulu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -529,7 +615,11 @@ define <8 x i32> @vwmulu_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwmulu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; 
CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -545,7 +635,11 @@ define <4 x i64> @vwmulu_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwmulu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vmul.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -562,7 +656,11 @@ define <32 x i16> @vwmulu_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwmulu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -578,7 +676,11 @@ define <16 x i32> @vwmulu_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwmulu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -594,7 +696,11 @@ define <8 x i64> @vwmulu_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwmulu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vmul.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -611,7 +717,11 @@ define <64 x i16> @vwmulu_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwmulu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -628,7 +738,11 @@ define <32 x i32> @vwmulu_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwmulu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -644,7 +758,11 @@ define <16 x i64> @vwmulu_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwmulu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vmul.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x 
i32> poison, i32 %y, i64 0 @@ -658,10 +776,11 @@ define <16 x i64> @vwmulu_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwmulu_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: lbu a0, 0(a1) -; CHECK-NEXT: vwmulu.vx v8, v9, a0 +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -694,10 +813,11 @@ define <8 x i16> @vwmulu_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwmulu_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lbu a0, 0(a1) -; CHECK-NEXT: vwmulu.vx v8, v9, a0 +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -712,10 +832,11 @@ define <4 x i32> @vwmulu_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwmulu_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: lhu a0, 0(a1) -; CHECK-NEXT: vwmulu.vx v8, v9, a0 +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vmul.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll index 2c9aed6274dd8..9d72b767e4840 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsll.ll @@ -19,10 +19,10 @@ define <4 x i64> @vwsll_vv_v4i64_sext(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v9 -; CHECK-ZVBB-NEXT: vmv1r.v v11, v8 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v11, v10 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i32> %a to <4 x i64> %y = sext <4 x i32> %b to <4 x i64> @@ -41,10 +41,10 @@ define <4 x i64> @vwsll_vv_v4i64_zext(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v9 -; CHECK-ZVBB-NEXT: vmv1r.v v11, v8 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v11, v10 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i32> %a to <4 x i64> %y = zext <4 x i32> %b to <4 x i64> @@ -62,9 +62,9 @@ define <4 x i64> @vwsll_vx_i64_v4i64(<4 x i32> %a, i64 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i64_v4i64: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 ; CHECK-ZVBB-NEXT: ret %head = 
insertelement <4 x i64> poison, i64 %b, i32 0 %splat = shufflevector <4 x i64> %head, <4 x i64> poison, <4 x i32> zeroinitializer @@ -87,9 +87,11 @@ define <4 x i64> @vwsll_vx_i32_v4i64_sext(<4 x i32> %a, i32 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_sext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vmv.v.x v11, a0 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v10, v11 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i32> poison, i32 %b, i32 0 %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -113,8 +115,11 @@ define <4 x i64> @vwsll_vx_i32_v4i64_zext(<4 x i32> %a, i32 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_zext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i32> poison, i32 %b, i32 0 %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -137,9 +142,12 @@ define <4 x i64> @vwsll_vx_i16_v4i64_sext(<4 x i32> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf4 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i16> poison, i16 %b, i32 0 %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -162,9 +170,12 @@ define <4 x i64> @vwsll_vx_i16_v4i64_zext(<4 x i32> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf4 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i16> poison, i16 %b, i32 0 %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -187,9 +198,12 @@ define <4 x i64> @vwsll_vx_i8_v4i64_sext(<4 x i32> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf8 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i8> poison, i8 %b, i32 0 %splat = shufflevector <4 x i8> 
%head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -212,9 +226,12 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf8 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i8> poison, i8 %b, i32 0 %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -227,17 +244,16 @@ define <4 x i64> @vwsll_vx_i8_v4i64_zext(<4 x i32> %a, i8 %b) { define <4 x i64> @vwsll_vi_v4i64(<4 x i32> %a) { ; CHECK-LABEL: vwsll_vi_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: li a0, 4 -; CHECK-NEXT: vwmulu.vx v8, v10, a0 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsll.vi v8, v10, 2 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vwsll_vi_v4i64: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i32> %a to <4 x i64> %z = shl <4 x i64> %x, splat (i64 2) @@ -259,10 +275,10 @@ define <8 x i32> @vwsll_vv_v8i32_sext(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v9 -; CHECK-ZVBB-NEXT: vmv1r.v v11, v8 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v11, v10 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <8 x i16> %a to <8 x i32> %y = sext <8 x i16> %b to <8 x i32> @@ -281,10 +297,10 @@ define <8 x i32> @vwsll_vv_v8i32_zext(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v8i32_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v9 -; CHECK-ZVBB-NEXT: vmv1r.v v11, v8 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v11, v10 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <8 x i16> %a to <8 x i32> %y = zext <8 x i16> %b to <8 x i32> @@ -302,9 +318,9 @@ define <8 x i32> @vwsll_vx_i64_v8i32(<8 x i16> %a, i64 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i64_v8i32: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i64> poison, i64 %b, i32 0 %splat = shufflevector <8 x i64> %head, <8 x i64> poison, <8 x i32> zeroinitializer @@ -324,9 +340,9 @@ define <8 x i32> @vwsll_vx_i32_v8i32(<8 x i16> %a, i32 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v8i32: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: 
vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i32> poison, i32 %b, i32 0 %splat = shufflevector <8 x i32> %head, <8 x i32> poison, <8 x i32> zeroinitializer @@ -349,9 +365,11 @@ define <8 x i32> @vwsll_vx_i16_v8i32_sext(<8 x i16> %a, i16 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v8i32_sext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vmv.v.x v11, a0 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v10, v11 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i16> poison, i16 %b, i32 0 %splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer @@ -375,8 +393,11 @@ define <8 x i32> @vwsll_vx_i16_v8i32_zext(<8 x i16> %a, i16 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v8i32_zext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i16> poison, i16 %b, i32 0 %splat = shufflevector <8 x i16> %head, <8 x i16> poison, <8 x i32> zeroinitializer @@ -399,9 +420,12 @@ define <8 x i32> @vwsll_vx_i8_v8i32_sext(<8 x i16> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf4 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i8> poison, i8 %b, i32 0 %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -424,9 +448,12 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v8i32_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf4 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <8 x i8> poison, i8 %b, i32 0 %splat = shufflevector <8 x i8> %head, <8 x i8> poison, <8 x i32> zeroinitializer @@ -439,17 +466,16 @@ define <8 x i32> @vwsll_vx_i8_v8i32_zext(<8 x i16> %a, i8 %b) { define <8 x i32> @vwsll_vi_v8i32(<8 x i16> %a) { ; CHECK-LABEL: vwsll_vi_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: li a0, 4 -; CHECK-NEXT: vwmulu.vx v8, v10, a0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: 
vzext.vf2 v10, v8 +; CHECK-NEXT: vsll.vi v8, v10, 2 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vwsll_vi_v8i32: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 ; CHECK-ZVBB-NEXT: ret %x = zext <8 x i16> %a to <8 x i32> %z = shl <8 x i32> %x, splat (i32 2) @@ -471,10 +497,10 @@ define <16 x i16> @vwsll_vv_v16i16_sext(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_sext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v9 -; CHECK-ZVBB-NEXT: vmv1r.v v11, v8 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v11, v10 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <16 x i8> %a to <16 x i16> %y = sext <16 x i8> %b to <16 x i16> @@ -493,10 +519,10 @@ define <16 x i16> @vwsll_vv_v16i16_zext(<16 x i8> %a, <16 x i8> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v16i16_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v9 -; CHECK-ZVBB-NEXT: vmv1r.v v11, v8 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v11, v10 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <16 x i8> %a to <16 x i16> %y = zext <16 x i8> %b to <16 x i16> @@ -550,9 +576,15 @@ define <16 x i16> @vwsll_vx_i64_v16i16(<16 x i8> %a, i64 %b) { ; ; CHECK-ZVBB-RV64-LABEL: vwsll_vx_i64_v16i16: ; CHECK-ZVBB-RV64: # %bb.0: -; CHECK-ZVBB-RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-RV64-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-RV64-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-ZVBB-RV64-NEXT: vmv.v.x v16, a0 +; CHECK-ZVBB-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-ZVBB-RV64-NEXT: vzext.vf2 v12, v8 +; CHECK-ZVBB-RV64-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-ZVBB-RV64-NEXT: vnsrl.wi v8, v16, 0 +; CHECK-ZVBB-RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-ZVBB-RV64-NEXT: vnsrl.wi v14, v8, 0 +; CHECK-ZVBB-RV64-NEXT: vsll.vv v8, v12, v14 ; CHECK-ZVBB-RV64-NEXT: ret %head = insertelement <8 x i64> poison, i64 %b, i32 0 %splat = shufflevector <8 x i64> %head, <8 x i64> poison, <16 x i32> zeroinitializer @@ -575,9 +607,12 @@ define <16 x i16> @vwsll_vx_i32_v16i16(<16 x i8> %a, i32 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v16i16: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i32> poison, i32 %b, i32 0 %splat = shufflevector <16 x i32> %head, <16 x i32> poison, <16 x i32> zeroinitializer @@ -597,9 +632,9 @@ define <16 x i16> @vwsll_vx_i16_v16i16(<16 x i8> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v16i16: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: 
vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vx v8, v10, a0 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i16> poison, i16 %b, i32 0 %splat = shufflevector <16 x i16> %head, <16 x i16> poison, <16 x i32> zeroinitializer @@ -622,9 +657,11 @@ define <16 x i16> @vwsll_vx_i8_v16i16_sext(<16 x i8> %a, i8 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v16i16_sext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vmv.v.x v11, a0 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v10, v11 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i8> poison, i8 %b, i32 0 %splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer @@ -648,8 +685,11 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v16i16_zext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <16 x i8> poison, i8 %b, i32 0 %splat = shufflevector <16 x i8> %head, <16 x i8> poison, <16 x i32> zeroinitializer @@ -662,17 +702,16 @@ define <16 x i16> @vwsll_vx_i8_v16i16_zext(<16 x i8> %a, i8 %b) { define <16 x i16> @vwsll_vi_v16i16(<16 x i8> %a) { ; CHECK-LABEL: vwsll_vi_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: li a0, 4 -; CHECK-NEXT: vwmulu.vx v8, v10, a0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vsll.vi v8, v10, 2 ; CHECK-NEXT: ret ; ; CHECK-ZVBB-LABEL: vwsll_vi_v16i16: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-ZVBB-NEXT: vmv1r.v v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf2 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 ; CHECK-ZVBB-NEXT: ret %x = zext <16 x i8> %a to <16 x i16> %z = shl <16 x i16> %x, splat (i16 2) @@ -716,10 +755,10 @@ define <4 x i64> @vwsll_vv_v4i64_v4i8_zext(<4 x i8> %a, <4 x i8> %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vv_v4i64_v4i8_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 -; CHECK-ZVBB-NEXT: vzext.vf4 v11, v9 -; CHECK-ZVBB-NEXT: vwsll.vv v8, v10, v11 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf8 v12, v9 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v12 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i8> %a to <4 x i64> %y = zext <4 x i8> %b to <4 x i64> @@ -790,8 +829,11 @@ define <4 x i64> @vwsll_vx_i32_v4i64_v4i8_zext(<4 x i8> %a, i32 %b) { ; CHECK-ZVBB-LABEL: vwsll_vx_i32_v4i64_v4i8_zext: ; CHECK-ZVBB: # %bb.0: ; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, 
zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf2 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i32> poison, i32 %b, i32 0 %splat = shufflevector <4 x i32> %head, <4 x i32> poison, <4 x i32> zeroinitializer @@ -842,9 +884,12 @@ define <4 x i64> @vwsll_vx_i16_v4i64_v4i8_zext(<4 x i8> %a, i16 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i16_v4i64_v4i8_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf4 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i16> poison, i16 %b, i32 0 %splat = shufflevector <4 x i16> %head, <4 x i16> poison, <4 x i32> zeroinitializer @@ -895,9 +940,12 @@ define <4 x i64> @vwsll_vx_i8_v4i64_v4i8_zext(<4 x i8> %a, i8 %b) { ; ; CHECK-ZVBB-LABEL: vwsll_vx_i8_v4i64_v4i8_zext: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vx v8, v10, a0 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; CHECK-ZVBB-NEXT: vmv.v.x v12, a0 +; CHECK-ZVBB-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vzext.vf8 v8, v12 +; CHECK-ZVBB-NEXT: vsll.vv v8, v10, v8 ; CHECK-ZVBB-NEXT: ret %head = insertelement <4 x i8> poison, i8 %b, i32 0 %splat = shufflevector <4 x i8> %head, <4 x i8> poison, <4 x i32> zeroinitializer @@ -917,9 +965,9 @@ define <4 x i64> @vwsll_vi_v4i64_v4i8(<4 x i8> %a) { ; ; CHECK-ZVBB-LABEL: vwsll_vi_v4i64_v4i8: ; CHECK-ZVBB: # %bb.0: -; CHECK-ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-ZVBB-NEXT: vzext.vf4 v10, v8 -; CHECK-ZVBB-NEXT: vwsll.vi v8, v10, 2 +; CHECK-ZVBB-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-ZVBB-NEXT: vzext.vf8 v10, v8 +; CHECK-ZVBB-NEXT: vsll.vi v8, v10, 2 ; CHECK-ZVBB-NEXT: ret %x = zext <4 x i8> %a to <4 x i64> %z = shl <4 x i64> %x, splat (i64 2) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll index eafea7292a54b..622708d36a912 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub-mask.ll @@ -5,13 +5,14 @@ define <8 x i64> @vwsub_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { ; CHECK-LABEL: vwsub_wv_mask_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv2r.v v16, v8 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv4r.v v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; CHECK-NEXT: vwsub.wv v8, v8, v16, v0.t +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsub.vv v8, v12, v8 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -23,13 +24,14 @@ define <8 x i64> @vwsub_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { define <8 x i64> @vwsubu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { ; CHECK-LABEL: vwsubu_wv_mask_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; 
CHECK-NEXT: vmv2r.v v16, v8 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv4r.v v8, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, mu -; CHECK-NEXT: vwsubu.wv v8, v8, v16, v0.t +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vsub.vv v8, v12, v8 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -41,13 +43,15 @@ define <8 x i64> @vwsubu_wv_mask_v8i32(<8 x i32> %x, <8 x i64> %y) { define <8 x i64> @vwsubu_vv_mask_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: vwsubu_vv_mask_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vmv2r.v v12, v10 ; CHECK-NEXT: li a0, 42 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmslt.vx v0, v8, a0 -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmerge.vvm v14, v10, v8, v0 -; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vzext.vf2 v16, v10 +; CHECK-NEXT: vsub.vv v8, v16, v12 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -65,7 +69,9 @@ define <8 x i64> @vwsub_wv_mask_v8i32_nonzero(<8 x i32> %x, <8 x i64> %y) { ; CHECK-NEXT: vmslt.vx v0, v8, a0 ; CHECK-NEXT: vmv.v.i v10, 1 ; CHECK-NEXT: vmerge.vvm v16, v10, v8, v0 -; CHECK-NEXT: vwsub.wv v8, v12, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsub.vv v8, v12, v8 ; CHECK-NEXT: ret %mask = icmp slt <8 x i32> %x, %a = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll index 783de24100613..9dd13180510dc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwsub_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,10 +23,12 @@ define <2 x i16> @vwsub_v2i16(ptr %x, ptr %y) { define <4 x i16> @vwsub_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -37,10 +41,12 @@ define <4 x i16> @vwsub_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwsub_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: 
vle16.v v10, (a1) -; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -53,10 +59,12 @@ define <2 x i32> @vwsub_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwsub_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -69,10 +77,12 @@ define <8 x i16> @vwsub_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwsub_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -85,10 +95,12 @@ define <4 x i32> @vwsub_v4i32(ptr %x, ptr %y) { define <2 x i64> @vwsub_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwsub.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -101,10 +113,12 @@ define <2 x i64> @vwsub_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwsub_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -117,10 +131,12 @@ define <16 x i16> @vwsub_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwsub_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -133,10 +149,12 @@ define <8 x i32> @vwsub_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwsub_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; 
CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -150,10 +168,12 @@ define <32 x i16> @vwsub_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwsub.vv v8, v12, v14 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -166,10 +186,12 @@ define <32 x i16> @vwsub_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwsub_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwsub.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -182,10 +204,12 @@ define <16 x i32> @vwsub_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwsub_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwsub.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -199,10 +223,12 @@ define <64 x i16> @vwsub_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwsub.vv v8, v16, v20 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -216,10 +242,12 @@ define <32 x i32> @vwsub_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwsub.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -232,10 +260,12 @@ define <32 x i32> @vwsub_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwsub_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v16i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwsub.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vsext.vf2 v8, v16 
+; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -255,18 +285,26 @@ define <128 x i16> @vwsub_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsub.vv v8, v8, v0 +; CHECK-NEXT: vsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -290,18 +328,26 @@ define <64 x i32> @vwsub_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwsub.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsub.vv v8, v8, v0 +; CHECK-NEXT: vsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -325,16 +371,24 @@ define <32 x i64> @vwsub_v32i64(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; 
CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwsub.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsub.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vsext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsub.vv v8, v8, v0 +; CHECK-NEXT: vsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -351,12 +405,12 @@ define <32 x i64> @vwsub_v32i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsub_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vsext.vf2 v11, v9 -; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf4 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -369,11 +423,12 @@ define <2 x i32> @vwsub_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwsub_v4i32_v4i8_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -386,11 +441,12 @@ define <4 x i32> @vwsub_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwsub_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsext.vf4 v11, v8 -; CHECK-NEXT: vwsub.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf8 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -404,8 +460,12 @@ define <2 x i16> @vwsub_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwsub_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -420,8 +480,12 @@ define <4 x i16> @vwsub_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwsub_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; 
CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -436,8 +500,12 @@ define <2 x i32> @vwsub_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwsub_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -452,8 +520,12 @@ define <8 x i16> @vwsub_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwsub_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -468,8 +540,12 @@ define <4 x i32> @vwsub_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwsub_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -484,8 +560,12 @@ define <2 x i64> @vwsub_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwsub_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwsub.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -501,7 +581,11 @@ define <16 x i16> @vwsub_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwsub.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -517,7 +601,11 @@ define <8 x i32> @vwsub_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwsub.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -533,7 +621,11 @@ define <4 x i64> @vwsub_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwsub.vx v8, v10, a1 +; 
CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -550,7 +642,11 @@ define <32 x i16> @vwsub_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwsub.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -566,7 +662,11 @@ define <16 x i32> @vwsub_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwsub.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -582,7 +682,11 @@ define <8 x i64> @vwsub_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwsub.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v12 +; CHECK-NEXT: vsext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -599,7 +703,11 @@ define <64 x i16> @vwsub_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwsub.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -616,7 +724,11 @@ define <32 x i32> @vwsub_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwsub.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -632,7 +744,11 @@ define <16 x i64> @vwsub_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwsub.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vsext.vf2 v8, v16 +; CHECK-NEXT: vsext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = insertelement <16 x i32> poison, i32 %y, i64 0 @@ -646,11 +762,11 @@ define <16 x i64> @vwsub_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; 
CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -665,12 +781,11 @@ define <8 x i16> @vwsub_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vwsub.wv v8, v8, v9 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i16, ptr %y @@ -684,11 +799,11 @@ define <8 x i16> @vwsub_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -703,11 +818,11 @@ define <4 x i32> @vwsub_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y @@ -722,12 +837,11 @@ define <4 x i32> @vwsub_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vwsub.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i32, ptr %y @@ -741,11 +855,11 @@ define <4 x i32> @vwsub_vx_v4i32_i32(ptr %x, ptr %y) { define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwsub_vx_v2i64_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lb a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lb a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -760,11 +874,11 @@ define <2 x i64> @vwsub_vx_v2i64_i8(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwsub_vx_v2i64_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) -; CHECK-NEXT: vsetivli 
zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ -779,11 +893,11 @@ define <2 x i64> @vwsub_vx_v2i64_i16(ptr %x, ptr %y) nounwind { define <2 x i64> @vwsub_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; CHECK-LABEL: vwsub_vx_v2i64_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsub.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vsext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y @@ -801,24 +915,24 @@ define <2 x i64> @vwsub_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a2, 0(a1) ; RV32-NEXT: lw a1, 4(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwsub.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vsext.vf2 v10, v8 +; RV32-NEXT: vsub.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwsub_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwsub.wv v8, v8, v9 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vsext.vf2 v9, v8 +; RV64-NEXT: vrsub.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i64, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll index bfdda47cc819e..e4423de84d8f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -5,10 +5,12 @@ define <2 x i16> @vwsubu_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -21,10 +23,12 @@ define <2 x i16> @vwsubu_v2i16(ptr %x, ptr %y) { define <4 x i16> @vwsubu_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i8>, ptr %y @@ -37,10 +41,12 @@ define <4 x i16> 
@vwsubu_v4i16(ptr %x, ptr %y) { define <2 x i32> @vwsubu_v2i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y @@ -53,10 +59,12 @@ define <2 x i32> @vwsubu_v2i32(ptr %x, ptr %y) { define <8 x i16> @vwsubu_v8i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vle8.v v10, (a1) -; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vle8.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load <8 x i8>, ptr %y @@ -69,10 +77,12 @@ define <8 x i16> @vwsubu_v8i16(ptr %x, ptr %y) { define <4 x i32> @vwsubu_v4i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vle16.v v10, (a1) -; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle16.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load <4 x i16>, ptr %y @@ -85,10 +95,12 @@ define <4 x i32> @vwsubu_v4i32(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vle32.v v10, (a1) -; CHECK-NEXT: vwsubu.vv v8, v9, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = load <2 x i32>, ptr %y @@ -101,10 +113,12 @@ define <2 x i64> @vwsubu_v2i64(ptr %x, ptr %y) { define <16 x i16> @vwsubu_v16i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v16i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vle8.v v11, (a1) -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = load <16 x i8>, ptr %y @@ -117,10 +131,12 @@ define <16 x i16> @vwsubu_v16i16(ptr %x, ptr %y) { define <8 x i32> @vwsubu_v8i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v8i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = load <8 x i16>, ptr %y @@ -133,10 +149,12 @@ 
define <8 x i32> @vwsubu_v8i32(ptr %x, ptr %y) { define <4 x i64> @vwsubu_v4i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vle32.v v11, (a1) -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vle32.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i32>, ptr %y @@ -150,10 +168,12 @@ define <32 x i16> @vwsubu_v32i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v32i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vle8.v v14, (a1) -; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: vle8.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = load <32 x i8>, ptr %y @@ -166,10 +186,12 @@ define <32 x i16> @vwsubu_v32i16(ptr %x, ptr %y) { define <16 x i32> @vwsubu_v16i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v16i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vle16.v v14, (a1) -; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: vle16.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = load <16 x i16>, ptr %y @@ -182,10 +204,12 @@ define <16 x i32> @vwsubu_v16i32(ptr %x, ptr %y) { define <8 x i64> @vwsubu_v8i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v8i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vle32.v v14, (a1) -; CHECK-NEXT: vwsubu.vv v8, v12, v14 +; CHECK-NEXT: vle32.v v16, (a1) +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = load <8 x i32>, ptr %y @@ -199,10 +223,12 @@ define <64 x i16> @vwsubu_v64i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v64i16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 64 -; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v20, (a1) -; CHECK-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = load <64 x i8>, ptr %y @@ -216,10 +242,12 @@ define <32 x i32> @vwsubu_v32i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v32i32: ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v20, (a1) -; CHECK-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = load <32 x i16>, ptr %y @@ -232,10 +260,12 @@ define <32 x i32> @vwsubu_v32i32(ptr %x, ptr %y) { define <16 x i64> @vwsubu_v16i64(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v16i64: ; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v20, (a1) -; CHECK-NEXT: vwsubu.vv v8, v16, v20 +; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = load <16 x i32>, ptr %y @@ -255,18 +285,26 @@ define <128 x i16> @vwsubu_v128i16(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 128 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vle8.v v24, (a1) +; CHECK-NEXT: vle8.v v8, (a1) ; CHECK-NEXT: li a0, 64 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsub.vv v8, v8, v0 +; CHECK-NEXT: vsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -290,18 +328,26 @@ define <64 x i32> @vwsubu_v64i32(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vle16.v v24, (a1) +; CHECK-NEXT: vle16.v v8, (a1) ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v16, a0 +; CHECK-NEXT: vslidedown.vx v24, v16, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vx v0, v24, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vwsubu.vv v8, v16, v24 +; CHECK-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsub.vv v8, v8, v0 +; CHECK-NEXT: vsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -325,16 +371,24 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vle32.v v24, (a1) +; CHECK-NEXT: vle32.v v8, (a1) ; CHECK-NEXT: vsetivli zero, 16, e32, m8, 
ta, ma -; CHECK-NEXT: vslidedown.vi v8, v16, 16 +; CHECK-NEXT: vslidedown.vi v24, v16, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v16 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vslidedown.vi v0, v24, 16 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vwsubu.vv v8, v16, v24 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vwsubu.vv v16, v24, v0 +; CHECK-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; CHECK-NEXT: vslidedown.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v0, v8 +; CHECK-NEXT: vmv4r.v v8, v24 +; CHECK-NEXT: vzext.vf2 v24, v8 +; CHECK-NEXT: vmv4r.v v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v8 +; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsub.vv v8, v8, v0 +; CHECK-NEXT: vsub.vv v16, v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 @@ -351,12 +405,14 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -369,11 +425,12 @@ define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) { define <4 x i32> @vwsubu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v4i32_v4i8_v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vzext.vf2 v10, v8 -; CHECK-NEXT: vwsubu.vv v8, v10, v9 +; CHECK-NEXT: vzext.vf4 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = load <4 x i16>, ptr %y @@ -386,11 +443,12 @@ define <4 x i32> @vwsubu_v4i32_v4i8_v4i16(ptr %x, ptr %y) { define <4 x i64> @vwsubu_v4i64_v4i32_v4i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v4i64_v4i32_v4i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle8.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vzext.vf4 v11, v8 -; CHECK-NEXT: vwsubu.vv v8, v10, v11 +; CHECK-NEXT: vle8.v v12, (a1) +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf8 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = load <4 x i8>, ptr %y @@ -404,8 +462,12 @@ define <2 x i16> @vwsubu_vx_v2i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwsubu_vx_v2i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = insertelement <2 x i8> poison, i8 %y, i32 0 @@ -420,8 +482,12 
@@ define <4 x i16> @vwsubu_vx_v4i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwsubu_vx_v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i8>, ptr %x %b = insertelement <4 x i8> poison, i8 %y, i32 0 @@ -436,8 +502,12 @@ define <2 x i32> @vwsubu_vx_v2i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwsubu_vx_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = insertelement <2 x i16> poison, i16 %y, i32 0 @@ -452,8 +522,12 @@ define <8 x i16> @vwsubu_vx_v8i16(ptr %x, i8 %y) { ; CHECK-LABEL: vwsubu_vx_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = insertelement <8 x i8> poison, i8 %y, i32 0 @@ -468,8 +542,12 @@ define <4 x i32> @vwsubu_vx_v4i32(ptr %x, i16 %y) { ; CHECK-LABEL: vwsubu_vx_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = insertelement <4 x i16> poison, i16 %y, i32 0 @@ -484,8 +562,12 @@ define <2 x i64> @vwsubu_vx_v2i64(ptr %x, i32 %y) { ; CHECK-LABEL: vwsubu_vx_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vwsubu.vx v8, v9, a1 +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v8, v10, v8 ; CHECK-NEXT: ret %a = load <2 x i32>, ptr %x %b = insertelement <2 x i32> poison, i32 %y, i64 0 @@ -501,7 +583,11 @@ define <16 x i16> @vwsubu_vx_v16i16(ptr %x, i8 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vwsubu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <16 x i8>, ptr %x %b = insertelement <16 x i8> poison, i8 %y, i32 0 @@ -517,7 +603,11 @@ define <8 x i32> @vwsubu_vx_v8i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vwsubu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: 
vzext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <8 x i16>, ptr %x %b = insertelement <8 x i16> poison, i16 %y, i32 0 @@ -533,7 +623,11 @@ define <4 x i64> @vwsubu_vx_v4i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vwsubu.vx v8, v10, a1 +; CHECK-NEXT: vmv.v.x v12, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v10 +; CHECK-NEXT: vzext.vf2 v10, v12 +; CHECK-NEXT: vsub.vv v8, v8, v10 ; CHECK-NEXT: ret %a = load <4 x i32>, ptr %x %b = insertelement <4 x i32> poison, i32 %y, i64 0 @@ -550,7 +644,11 @@ define <32 x i16> @vwsubu_vx_v32i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma ; CHECK-NEXT: vle8.v v12, (a0) -; CHECK-NEXT: vwsubu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <32 x i8>, ptr %x %b = insertelement <32 x i8> poison, i8 %y, i32 0 @@ -566,7 +664,11 @@ define <16 x i32> @vwsubu_vx_v16i32(ptr %x, i16 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; CHECK-NEXT: vle16.v v12, (a0) -; CHECK-NEXT: vwsubu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <16 x i16>, ptr %x %b = insertelement <16 x i16> poison, i16 %y, i32 0 @@ -582,7 +684,11 @@ define <8 x i64> @vwsubu_vx_v8i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vwsubu.vx v8, v12, a1 +; CHECK-NEXT: vmv.v.x v16, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v12 +; CHECK-NEXT: vzext.vf2 v12, v16 +; CHECK-NEXT: vsub.vv v8, v8, v12 ; CHECK-NEXT: ret %a = load <8 x i32>, ptr %x %b = insertelement <8 x i32> poison, i32 %y, i64 0 @@ -599,7 +705,11 @@ define <64 x i16> @vwsubu_vx_v64i16(ptr %x, i8 %y) { ; CHECK-NEXT: li a2, 64 ; CHECK-NEXT: vsetvli zero, a2, e8, m4, ta, ma ; CHECK-NEXT: vle8.v v16, (a0) -; CHECK-NEXT: vwsubu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <64 x i8>, ptr %x %b = insertelement <64 x i8> poison, i8 %y, i32 0 @@ -616,7 +726,11 @@ define <32 x i32> @vwsubu_vx_v32i32(ptr %x, i16 %y) { ; CHECK-NEXT: li a2, 32 ; CHECK-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vwsubu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <32 x i16>, ptr %x %b = insertelement <32 x i16> poison, i16 %y, i32 0 @@ -632,7 +746,11 @@ define <16 x i64> @vwsubu_vx_v16i64(ptr %x, i32 %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vle32.v v16, (a0) -; CHECK-NEXT: vwsubu.vx v8, v16, a1 +; CHECK-NEXT: vmv.v.x v24, a1 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; CHECK-NEXT: vzext.vf2 v8, v16 +; CHECK-NEXT: vzext.vf2 v16, v24 +; CHECK-NEXT: vsub.vv v8, v8, v16 ; CHECK-NEXT: ret %a = load <16 x i32>, ptr %x %b = 
insertelement <16 x i32> poison, i32 %y, i64 0 @@ -646,11 +764,11 @@ define <16 x i64> @vwsubu_vx_v16i64(ptr %x, i32 %y) { define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsubu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lbu a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i8, ptr %y @@ -665,12 +783,11 @@ define <8 x i16> @vwsubu_vx_v8i16_i8(ptr %x, ptr %y) { define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma -; CHECK-NEXT: vwsubu.wv v8, v8, v9 +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: lh a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <8 x i8>, ptr %x %b = load i16, ptr %y @@ -684,11 +801,11 @@ define <8 x i16> @vwsubu_vx_v8i16_i16(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: lbu a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsubu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lbu a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i8, ptr %y @@ -703,11 +820,11 @@ define <4 x i32> @vwsubu_vx_v4i32_i8(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lhu a1, 0(a1) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v10, a1 -; CHECK-NEXT: vwsubu.vv v8, v10, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lhu a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i16, ptr %y @@ -722,12 +839,11 @@ define <4 x i32> @vwsubu_vx_v4i32_i16(ptr %x, ptr %y) { define <4 x i32> @vwsubu_vx_v4i32_i32(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vwsubu.wv v8, v8, v9 +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lw a0, 0(a1) +; CHECK-NEXT: vzext.vf2 v9, v8 +; CHECK-NEXT: vrsub.vx v8, v9, a0 ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %x %b = load i32, ptr %y @@ -743,23 +859,24 @@ define <2 x i64> @vwsubu_vx_v2i64_i8(ptr %x, ptr %y) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lbu a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwsubu.wv v8, v8, 
v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsub.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwsubu_vx_v2i64_i8: ; RV64: # %bb.0: -; RV64-NEXT: lbu a1, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vwsubu.vv v8, v10, v9 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vrsub.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -776,23 +893,24 @@ define <2 x i64> @vwsubu_vx_v2i64_i16(ptr %x, ptr %y) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lhu a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwsubu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsub.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwsubu_vx_v2i64_i16: ; RV64: # %bb.0: -; RV64-NEXT: lhu a1, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vwsubu.vv v8, v10, v9 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: lhu a0, 0(a1) +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vrsub.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ -809,23 +927,24 @@ define <2 x i64> @vwsubu_vx_v2i64_i32(ptr %x, ptr %y) nounwind { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwsubu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsub.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwsubu_vx_v2i64_i32: ; RV64: # %bb.0: -; RV64-NEXT: lwu a1, 0(a1) -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v10, a1 -; RV64-NEXT: vwsubu.vv v8, v10, v9 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: lwu a0, 0(a1) +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vrsub.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y @@ -843,24 +962,24 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: lw a2, 0(a1) ; RV32-NEXT: lw a1, 4(a1) -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a2, 8(sp) ; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v8, (a0), zero -; RV32-NEXT: vwsubu.wv v8, v8, v9 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vsub.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vwsubu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vsetivli zero, 2, e64, 
m1, ta, ma -; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; RV64-NEXT: vwsubu.wv v8, v8, v9 +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: ld a0, 0(a1) +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vrsub.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i64, ptr %y @@ -874,12 +993,14 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind { define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i32_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -892,12 +1013,14 @@ define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64_of_v2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vle8.v v9, (a1) -; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vsext.vf4 v8, v10 +; CHECK-NEXT: vsext.vf4 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i8>, ptr %x %b = load <2 x i8>, ptr %y @@ -910,12 +1033,14 @@ define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) { define <2 x i64> @vwsubu_v2i64_of_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwsubu_v2i64_of_v2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vwsubu.vv v10, v8, v9 +; CHECK-NEXT: vzext.vf2 v10, v8 +; CHECK-NEXT: vzext.vf2 v8, v9 +; CHECK-NEXT: vsub.vv v9, v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vsext.vf2 v8, v10 +; CHECK-NEXT: vsext.vf2 v8, v9 ; CHECK-NEXT: ret %a = load <2 x i16>, ptr %x %b = load <2 x i16>, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll index a189711d11471..73d1900b448eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-zvqdotq.ll @@ -7,11 +7,10 @@ define i32 @vqdot_vv(<16 x i8> %a, <16 x i8> %b) { ; NODOT-LABEL: vqdot_vv: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vsext.vf2 v14, v9 -; NODOT-NEXT: vwmul.vv v8, v12, v14 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vsext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v8, v12, v16 ; NODOT-NEXT: vmv.s.x v12, zero ; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 @@ -37,11 +36,10 @@ entry: define i32 @vqdot_vx_constant(<16 x i8> %a) { ; CHECK-LABEL: vqdot_vx_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf4 v12, v8 ; 
CHECK-NEXT: li a0, 23 -; CHECK-NEXT: vwmul.vx v8, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v12, a0 ; CHECK-NEXT: vmv.s.x v12, zero ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 @@ -56,11 +54,10 @@ entry: define i32 @vqdot_vx_constant_swapped(<16 x i8> %a) { ; CHECK-LABEL: vqdot_vx_constant_swapped: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf4 v12, v8 ; CHECK-NEXT: li a0, 23 -; CHECK-NEXT: vwmul.vx v8, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v12, a0 ; CHECK-NEXT: vmv.s.x v12, zero ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 @@ -75,13 +72,14 @@ entry: define i32 @vqdotu_vv(<16 x i8> %a, <16 x i8> %b) { ; NODOT-LABEL: vqdotu_vv: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; NODOT-NEXT: vwmulu.vv v10, v8, v9 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; NODOT-NEXT: vmv.s.x v8, zero -; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; NODOT-NEXT: vwredsumu.vs v8, v10, v8 +; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; NODOT-NEXT: vzext.vf2 v10, v9 +; NODOT-NEXT: vzext.vf2 v12, v8 +; NODOT-NEXT: vmul.vv v12, v12, v10 ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vzext.vf2 v8, v12 +; NODOT-NEXT: vmv.s.x v12, zero +; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; @@ -105,11 +103,10 @@ entry: define i32 @vqdotu_vx_constant(<16 x i8> %a) { ; CHECK-LABEL: vqdotu_vx_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v8 ; CHECK-NEXT: li a0, 123 -; CHECK-NEXT: vwmulu.vx v8, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v12, a0 ; CHECK-NEXT: vmv.s.x v12, zero ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 @@ -124,11 +121,10 @@ entry: define i32 @vqdotsu_vv(<16 x i8> %a, <16 x i8> %b) { ; NODOT-LABEL: vqdotsu_vv: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vzext.vf2 v14, v9 -; NODOT-NEXT: vwmulsu.vv v8, v12, v14 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vzext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v8, v12, v16 ; NODOT-NEXT: vmv.s.x v12, zero ; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 @@ -154,11 +150,10 @@ entry: define i32 @vqdotsu_vv_swapped(<16 x i8> %a, <16 x i8> %b) { ; NODOT-LABEL: vqdotsu_vv_swapped: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vzext.vf2 v14, v9 -; NODOT-NEXT: vwmulsu.vv v8, v12, v14 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vzext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v8, v16, v12 ; NODOT-NEXT: vmv.s.x v12, zero ; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 @@ -184,11 +179,10 @@ entry: define i32 @vdotqsu_vx_constant(<16 x i8> %a) { ; CHECK-LABEL: vdotqsu_vx_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 
16, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf4 v12, v8 ; CHECK-NEXT: li a0, 123 -; CHECK-NEXT: vwmul.vx v8, v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v12, a0 ; CHECK-NEXT: vmv.s.x v12, zero ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 @@ -203,12 +197,10 @@ entry: define i32 @vdotqus_vx_constant(<16 x i8> %a) { ; CHECK-LABEL: vdotqus_vx_constant: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vzext.vf2 v12, v8 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vzext.vf4 v12, v8 ; CHECK-NEXT: li a0, -23 -; CHECK-NEXT: vmv.v.x v14, a0 -; CHECK-NEXT: vwmulsu.vv v8, v14, v12 -; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vmul.vx v8, v12, a0 ; CHECK-NEXT: vmv.s.x v12, zero ; CHECK-NEXT: vredsum.vs v8, v8, v12 ; CHECK-NEXT: vmv.x.s a0, v8 @@ -277,23 +269,24 @@ entry: define i32 @vqdot_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { ; NODOT-LABEL: vqdot_vv_accum: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v10, v8 -; NODOT-NEXT: vsext.vf2 v16, v9 -; NODOT-NEXT: vwmacc.vv v12, v10, v16 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; NODOT-NEXT: vmv.s.x v8, zero -; NODOT-NEXT: vredsum.vs v8, v12, v8 +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v16, v8 +; NODOT-NEXT: vsext.vf4 v20, v9 +; NODOT-NEXT: vmul.vv v8, v16, v20 +; NODOT-NEXT: vadd.vv v8, v8, v12 +; NODOT-NEXT: vmv.s.x v12, zero +; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdot_vv_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vqdot.vv v16, v8, v9 +; DOT-NEXT: vmv.v.i v10, 0 +; DOT-NEXT: vqdot.vv v10, v8, v9 +; DOT-NEXT: vadd.vv v8, v10, v12 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma -; DOT-NEXT: vmv.v.v v12, v16 +; DOT-NEXT: vmv.v.v v12, v8 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; DOT-NEXT: vredsum.vs v8, v12, v8 @@ -311,23 +304,26 @@ entry: define i32 @vqdotu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { ; NODOT-LABEL: vqdotu_vv_accum: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; NODOT-NEXT: vwmulu.vv v10, v8, v9 -; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; NODOT-NEXT: vwaddu.wv v12, v12, v10 +; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; NODOT-NEXT: vzext.vf2 v10, v9 +; NODOT-NEXT: vzext.vf2 v16, v8 +; NODOT-NEXT: vmul.vv v16, v16, v10 ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; NODOT-NEXT: vmv.s.x v8, zero -; NODOT-NEXT: vredsum.vs v8, v12, v8 +; NODOT-NEXT: vzext.vf2 v8, v16 +; NODOT-NEXT: vadd.vv v8, v8, v12 +; NODOT-NEXT: vmv.s.x v12, zero +; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdotu_vv_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vqdotu.vv v16, v8, v9 +; DOT-NEXT: vmv.v.i v10, 0 +; DOT-NEXT: vqdotu.vv v10, v8, v9 +; DOT-NEXT: vadd.vv v8, v10, v12 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma -; DOT-NEXT: vmv.v.v v12, v16 +; DOT-NEXT: vmv.v.v v12, v8 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; DOT-NEXT: vredsum.vs v8, v12, v8 @@ -345,23 +341,24 @@ entry: define i32 @vqdotsu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) { ; NODOT-LABEL: vqdotsu_vv_accum: ; NODOT: # %bb.0: # %entry -; 
NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v10, v8 -; NODOT-NEXT: vzext.vf2 v16, v9 -; NODOT-NEXT: vwmaccsu.vv v12, v10, v16 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; NODOT-NEXT: vmv.s.x v8, zero -; NODOT-NEXT: vredsum.vs v8, v12, v8 +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v16, v8 +; NODOT-NEXT: vzext.vf4 v20, v9 +; NODOT-NEXT: vmul.vv v8, v16, v20 +; NODOT-NEXT: vadd.vv v8, v8, v12 +; NODOT-NEXT: vmv.s.x v12, zero +; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdotsu_vv_accum: ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; DOT-NEXT: vmv1r.v v16, v12 -; DOT-NEXT: vqdotsu.vv v16, v8, v9 +; DOT-NEXT: vmv.v.i v10, 0 +; DOT-NEXT: vqdotsu.vv v10, v8, v9 +; DOT-NEXT: vadd.vv v8, v10, v12 ; DOT-NEXT: vsetivli zero, 4, e32, m4, tu, ma -; DOT-NEXT: vmv.v.v v12, v16 +; DOT-NEXT: vmv.v.v v12, v8 ; DOT-NEXT: vmv.s.x v8, zero ; DOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; DOT-NEXT: vredsum.vs v8, v12, v8 @@ -379,11 +376,10 @@ entry: define i32 @vqdot_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { ; NODOT-LABEL: vqdot_vv_scalar_add: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vsext.vf2 v14, v9 -; NODOT-NEXT: vwmul.vv v8, v12, v14 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vsext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v8, v12, v16 ; NODOT-NEXT: vmv.s.x v12, a0 ; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 @@ -410,13 +406,14 @@ entry: define i32 @vqdotu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { ; NODOT-LABEL: vqdotu_vv_scalar_add: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; NODOT-NEXT: vwmulu.vv v10, v8, v9 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; NODOT-NEXT: vmv.s.x v8, a0 -; NODOT-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; NODOT-NEXT: vwredsumu.vs v8, v10, v8 +; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; NODOT-NEXT: vzext.vf2 v10, v9 +; NODOT-NEXT: vzext.vf2 v12, v8 +; NODOT-NEXT: vmul.vv v12, v12, v10 ; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vzext.vf2 v8, v12 +; NODOT-NEXT: vmv.s.x v12, a0 +; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 ; NODOT-NEXT: ret ; @@ -441,11 +438,10 @@ entry: define i32 @vqdotsu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) { ; NODOT-LABEL: vqdotsu_vv_scalar_add: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vzext.vf2 v14, v9 -; NODOT-NEXT: vwmulsu.vv v8, v12, v14 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vzext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v8, v12, v16 ; NODOT-NEXT: vmv.s.x v12, a0 ; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 @@ -472,14 +468,14 @@ entry: define i32 @vqdot_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { ; NODOT-LABEL: vqdot_vv_split: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vsext.vf2 v14, v9 -; NODOT-NEXT: vsext.vf2 v16, v10 -; NODOT-NEXT: vsext.vf2 v18, v11 -; NODOT-NEXT: vwmul.vv v8, v12, v14 -; NODOT-NEXT: vwmacc.vv v8, v16, v18 -; NODOT-NEXT: vsetvli zero, zero, e32, m4, ta, 
ma +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vsext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v12, v12, v16 +; NODOT-NEXT: vsext.vf4 v16, v10 +; NODOT-NEXT: vsext.vf4 v20, v11 +; NODOT-NEXT: vmul.vv v8, v16, v20 +; NODOT-NEXT: vadd.vv v8, v12, v8 ; NODOT-NEXT: vmv.s.x v12, zero ; NODOT-NEXT: vredsum.vs v8, v8, v12 ; NODOT-NEXT: vmv.x.s a0, v8 @@ -489,10 +485,12 @@ define i32 @vqdot_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> % ; DOT: # %bb.0: # %entry ; DOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; DOT-NEXT: vmv.v.i v12, 0 +; DOT-NEXT: vmv.v.i v13, 0 ; DOT-NEXT: vqdot.vv v12, v8, v9 -; DOT-NEXT: vqdot.vv v12, v10, v11 -; DOT-NEXT: vmv.s.x v8, zero -; DOT-NEXT: vredsum.vs v8, v12, v8 +; DOT-NEXT: vqdot.vv v13, v10, v11 +; DOT-NEXT: vadd.vv v8, v12, v13 +; DOT-NEXT: vmv.s.x v9, zero +; DOT-NEXT: vredsum.vs v8, v8, v9 ; DOT-NEXT: vmv.x.s a0, v8 ; DOT-NEXT: ret entry: @@ -510,20 +508,19 @@ entry: define <1 x i32> @vqdot_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { ; NODOT-LABEL: vqdot_vv_partial_reduce_v1i32_v4i8: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; NODOT-NEXT: vsext.vf2 v10, v8 -; NODOT-NEXT: vsext.vf2 v8, v9 -; NODOT-NEXT: vwmul.vv v9, v10, v8 -; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v8, v9, 3 -; NODOT-NEXT: vslidedown.vi v10, v9, 2 +; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; NODOT-NEXT: vsext.vf4 v10, v8 +; NODOT-NEXT: vsext.vf4 v8, v9 +; NODOT-NEXT: vmul.vv v8, v10, v8 +; NODOT-NEXT: vslidedown.vi v9, v8, 3 +; NODOT-NEXT: vslidedown.vi v10, v8, 2 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v8, v8, v9 +; NODOT-NEXT: vadd.vv v9, v9, v8 ; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v9, v9, 1 +; NODOT-NEXT: vslidedown.vi v8, v8, 1 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v9, v9, v10 -; NODOT-NEXT: vadd.vv v8, v9, v8 +; NODOT-NEXT: vadd.vv v8, v8, v10 +; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdot_vv_partial_reduce_v1i32_v4i8: @@ -544,19 +541,21 @@ entry: define <1 x i32> @vqdotu_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { ; NODOT-LABEL: vqdotu_vv_partial_reduce_v1i32_v4i8: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; NODOT-NEXT: vwmulu.vv v10, v8, v9 +; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; NODOT-NEXT: vzext.vf2 v10, v9 +; NODOT-NEXT: vzext.vf2 v9, v8 +; NODOT-NEXT: vmul.vv v8, v9, v10 ; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; NODOT-NEXT: vzext.vf2 v8, v10 -; NODOT-NEXT: vslidedown.vi v9, v8, 3 -; NODOT-NEXT: vslidedown.vi v10, v8, 2 +; NODOT-NEXT: vzext.vf2 v9, v8 +; NODOT-NEXT: vslidedown.vi v8, v9, 3 +; NODOT-NEXT: vslidedown.vi v10, v9, 2 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v9, v9, v8 +; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v8, v8, 1 +; NODOT-NEXT: vslidedown.vi v9, v9, 1 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v8, v8, v10 -; NODOT-NEXT: vadd.vv v8, v8, v9 +; NODOT-NEXT: vadd.vv v9, v9, v10 +; NODOT-NEXT: vadd.vv v8, v9, v8 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdotu_vv_partial_reduce_v1i32_v4i8: @@ -648,20 +647,19 @@ entry: define <1 x i32> @vqdotsu_vv_partial_reduce_v1i32_v4i8(<4 x i8> %a, <4 x i8> %b) { ; NODOT-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli 
zero, 4, e16, mf2, ta, ma -; NODOT-NEXT: vsext.vf2 v10, v8 -; NODOT-NEXT: vzext.vf2 v8, v9 -; NODOT-NEXT: vwmulsu.vv v9, v10, v8 -; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v8, v9, 3 -; NODOT-NEXT: vslidedown.vi v10, v9, 2 +; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; NODOT-NEXT: vsext.vf4 v10, v8 +; NODOT-NEXT: vzext.vf4 v8, v9 +; NODOT-NEXT: vmul.vv v8, v10, v8 +; NODOT-NEXT: vslidedown.vi v9, v8, 3 +; NODOT-NEXT: vslidedown.vi v10, v8, 2 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v8, v8, v9 +; NODOT-NEXT: vadd.vv v9, v9, v8 ; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v9, v9, 1 +; NODOT-NEXT: vslidedown.vi v8, v8, 1 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v9, v9, v10 -; NODOT-NEXT: vadd.vv v8, v9, v8 +; NODOT-NEXT: vadd.vv v8, v8, v10 +; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdotsu_vv_partial_reduce_v1i32_v4i8: @@ -682,20 +680,19 @@ entry: define <1 x i32> @vqdotsu_vv_partial_reduce_swapped(<4 x i8> %a, <4 x i8> %b) { ; NODOT-LABEL: vqdotsu_vv_partial_reduce_swapped: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; NODOT-NEXT: vsext.vf2 v10, v8 -; NODOT-NEXT: vzext.vf2 v8, v9 -; NODOT-NEXT: vwmulsu.vv v9, v10, v8 -; NODOT-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v8, v9, 3 -; NODOT-NEXT: vslidedown.vi v10, v9, 2 +; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; NODOT-NEXT: vsext.vf4 v10, v8 +; NODOT-NEXT: vzext.vf4 v8, v9 +; NODOT-NEXT: vmul.vv v8, v8, v10 +; NODOT-NEXT: vslidedown.vi v9, v8, 3 +; NODOT-NEXT: vslidedown.vi v10, v8, 2 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v8, v8, v9 +; NODOT-NEXT: vadd.vv v9, v9, v8 ; NODOT-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; NODOT-NEXT: vslidedown.vi v9, v9, 1 +; NODOT-NEXT: vslidedown.vi v8, v8, 1 ; NODOT-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; NODOT-NEXT: vadd.vv v9, v9, v10 -; NODOT-NEXT: vadd.vv v8, v9, v8 +; NODOT-NEXT: vadd.vv v8, v8, v10 +; NODOT-NEXT: vadd.vv v8, v8, v9 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdotsu_vv_partial_reduce_swapped: @@ -740,10 +737,10 @@ entry: define <2 x i32> @vqdot_vv_partial_reduce_v2i32_v8i8(<8 x i8> %a, <8 x i8> %b) { ; NODOT-LABEL: vqdot_vv_partial_reduce_v2i32_v8i8: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; NODOT-NEXT: vsext.vf2 v10, v8 -; NODOT-NEXT: vsext.vf2 v11, v9 -; NODOT-NEXT: vwmul.vv v8, v10, v11 +; NODOT-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; NODOT-NEXT: vsext.vf4 v10, v8 +; NODOT-NEXT: vsext.vf4 v12, v9 +; NODOT-NEXT: vmul.vv v8, v10, v12 ; NODOT-NEXT: vsetivli zero, 2, e32, m2, ta, ma ; NODOT-NEXT: vslidedown.vi v10, v8, 6 ; NODOT-NEXT: vsetivli zero, 2, e32, mf2, ta, ma @@ -783,10 +780,10 @@ define <2 x i32> @vqdot_vv_partial_reduce_v2i32_v64i8(<64 x i8> %a, <64 x i8> %b ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x05, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 5 * vlenb ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vsext.vf2 v24, v8 -; CHECK-NEXT: vsext.vf2 v28, v12 -; CHECK-NEXT: vwmul.vv v16, v24, v28 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf4 v16, v8 +; CHECK-NEXT: vsext.vf4 v24, v12 +; CHECK-NEXT: vmul.vv v16, v16, v24 ; CHECK-NEXT: vsetivli zero, 2, e32, m8, ta, ma ; CHECK-NEXT: vslidedown.vi v24, v16, 28 ; CHECK-NEXT: vslidedown.vi v0, v16, 26 @@ -819,12 
+816,12 @@ define <2 x i32> @vqdot_vv_partial_reduce_v2i32_v64i8(<64 x i8> %a, <64 x i8> %b ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs1r.v v24, (a1) # vscale x 8-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vslidedown.vx v12, v12, a0 -; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vsext.vf2 v24, v8 -; CHECK-NEXT: vsext.vf2 v28, v12 -; CHECK-NEXT: vwmul.vv v8, v24, v28 +; CHECK-NEXT: vslidedown.vx v24, v8, a0 +; CHECK-NEXT: vslidedown.vx v28, v12, a0 +; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; CHECK-NEXT: vsext.vf4 v8, v24 +; CHECK-NEXT: vsext.vf4 v0, v28 +; CHECK-NEXT: vmul.vv v8, v8, v0 ; CHECK-NEXT: vsetivli zero, 2, e32, m8, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v8, 28 ; CHECK-NEXT: vslidedown.vi v24, v8, 26 @@ -952,10 +949,10 @@ entry: define <4 x i32> @vqdot_vv_partial_reduce_v4i32_v16i8(<16 x i8> %a, <16 x i8> %b) { ; NODOT-LABEL: vqdot_vv_partial_reduce_v4i32_v16i8: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v8 -; NODOT-NEXT: vsext.vf2 v14, v9 -; NODOT-NEXT: vwmul.vv v8, v12, v14 +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vsext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v8, v12, v16 ; NODOT-NEXT: vsetivli zero, 4, e32, m4, ta, ma ; NODOT-NEXT: vslidedown.vi v12, v8, 12 ; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma @@ -987,29 +984,45 @@ entry: define <16 x i32> @vqdot_vv_partial_reduce_v16i32_v64i8(<64 x i8> %a, <64 x i8> %b) { ; NODOT-LABEL: vqdot_vv_partial_reduce_v16i32_v64i8: ; NODOT: # %bb.0: # %entry +; NODOT-NEXT: addi sp, sp, -16 +; NODOT-NEXT: .cfi_def_cfa_offset 16 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: sub sp, sp, a0 +; NODOT-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; NODOT-NEXT: li a0, 32 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v16, v8 +; NODOT-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v16, v8 +; NODOT-NEXT: addi a1, sp, 16 +; NODOT-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v8, v8, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v20, v12 +; NODOT-NEXT: vslidedown.vx v4, v8, a0 +; NODOT-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v16, v12 ; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v12, v12, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v24, v8 -; NODOT-NEXT: vsext.vf2 v28, v12 -; NODOT-NEXT: vwmul.vv v8, v16, v20 -; NODOT-NEXT: vwmul.vv v16, v24, v28 +; NODOT-NEXT: vslidedown.vx v0, v12, a0 +; NODOT-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v24, v4 +; NODOT-NEXT: vsext.vf4 v8, v0 +; NODOT-NEXT: addi a0, sp, 16 +; NODOT-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vmul.vv v16, v0, v16 +; NODOT-NEXT: vmul.vv v8, v24, v8 ; NODOT-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; NODOT-NEXT: vslidedown.vi v24, v8, 16 +; NODOT-NEXT: vslidedown.vi v24, v16, 16 ; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; NODOT-NEXT: vadd.vv v8, v24, v8 -; NODOT-NEXT: vadd.vv v24, v8, v16 +; NODOT-NEXT: vadd.vv v16, v24, v16 +; NODOT-NEXT: vadd.vv v16, v16, v8 ; NODOT-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; NODOT-NEXT: vslidedown.vi v8, v16, 16 +; 
NODOT-NEXT: vslidedown.vi v8, v8, 16 ; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; NODOT-NEXT: vadd.vv v8, v8, v24 +; NODOT-NEXT: vadd.vv v8, v8, v16 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: add sp, sp, a0 +; NODOT-NEXT: .cfi_def_cfa sp, 16 +; NODOT-NEXT: addi sp, sp, 16 +; NODOT-NEXT: .cfi_def_cfa_offset 0 ; NODOT-NEXT: ret ; ; DOT-LABEL: vqdot_vv_partial_reduce_v16i32_v64i8: @@ -1030,10 +1043,10 @@ entry: define <4 x i32> @vqdot_vv_partial_reduce_m1_accum(<16 x i8> %a, <16 x i8> %b, <4 x i32> %accum) { ; NODOT-LABEL: vqdot_vv_partial_reduce_m1_accum: ; NODOT: # %bb.0: # %entry -; NODOT-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; NODOT-NEXT: vsext.vf2 v16, v8 -; NODOT-NEXT: vsext.vf2 v18, v9 -; NODOT-NEXT: vwmul.vv v12, v16, v18 +; NODOT-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; NODOT-NEXT: vsext.vf4 v12, v8 +; NODOT-NEXT: vsext.vf4 v16, v9 +; NODOT-NEXT: vmul.vv v12, v12, v16 ; NODOT-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; NODOT-NEXT: vadd.vv v16, v10, v12 ; NODOT-NEXT: vsetivli zero, 4, e32, m4, ta, ma @@ -1066,10 +1079,10 @@ entry: define <16 x i32> @vqdot_vv_partial_reduce3(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: vqdot_vv_partial_reduce3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v8 -; CHECK-NEXT: vsext.vf2 v14, v9 -; CHECK-NEXT: vwmul.vv v8, v12, v14 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vsext.vf4 v12, v8 +; CHECK-NEXT: vsext.vf4 v16, v9 +; CHECK-NEXT: vmul.vv v8, v12, v16 ; CHECK-NEXT: ret entry: %a.sext = sext <16 x i8> %a to <16 x i32> @@ -1088,210 +1101,301 @@ define <64 x i32> @vqdotsu_vv_partial_v64i32_v256i8(<256 x i8> %a, <256 x i8> %b ; NODOT-NEXT: csrr a1, vlenb ; NODOT-NEXT: slli a1, a1, 3 ; NODOT-NEXT: mv a2, a1 -; NODOT-NEXT: slli a1, a1, 2 +; NODOT-NEXT: slli a1, a1, 3 ; NODOT-NEXT: add a1, a1, a2 ; NODOT-NEXT: sub sp, sp, a1 -; NODOT-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; NODOT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 72 * vlenb ; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 4 +; NODOT-NEXT: slli a1, a1, 3 +; NODOT-NEXT: mv a2, a1 +; NODOT-NEXT: slli a1, a1, 1 +; NODOT-NEXT: add a1, a1, a2 ; NODOT-NEXT: add a1, sp, a1 ; NODOT-NEXT: addi a1, a1, 16 ; NODOT-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 5 +; NODOT-NEXT: slli a1, a1, 6 ; NODOT-NEXT: add a1, sp, a1 ; NODOT-NEXT: addi a1, a1, 16 ; NODOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; NODOT-NEXT: addi a1, a0, 128 ; NODOT-NEXT: li a2, 128 ; NODOT-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; NODOT-NEXT: vle8.v v0, (a0) +; NODOT-NEXT: vle8.v v0, (a1) +; NODOT-NEXT: csrr a1, vlenb +; NODOT-NEXT: slli a1, a1, 3 +; NODOT-NEXT: mv a2, a1 +; NODOT-NEXT: slli a1, a1, 2 +; NODOT-NEXT: add a1, a1, a2 +; NODOT-NEXT: add a1, sp, a1 +; NODOT-NEXT: addi a1, a1, 16 +; NODOT-NEXT: vs8r.v v0, (a1) # vscale x 64-byte Folded Spill +; NODOT-NEXT: li a1, 32 +; NODOT-NEXT: vle8.v v8, (a0) ; NODOT-NEXT: csrr a0, vlenb ; NODOT-NEXT: slli a0, a0, 3 -; NODOT-NEXT: mv a3, a0 +; NODOT-NEXT: mv a2, a0 ; NODOT-NEXT: slli a0, a0, 1 -; NODOT-NEXT: add a0, a0, a3 +; NODOT-NEXT: add a2, a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a2 ; NODOT-NEXT: add a0, sp, a0 ; NODOT-NEXT: addi a0, a0, 16 -; NODOT-NEXT: vs8r.v v0, (a0) # vscale x 
64-byte Folded Spill -; NODOT-NEXT: li a0, 32 -; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v24, v8, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v8, v24 -; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v12, v0, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vzext.vf2 v4, v12 -; NODOT-NEXT: vwmulsu.vv v24, v8, v4 -; NODOT-NEXT: csrr a3, vlenb -; NODOT-NEXT: slli a3, a3, 5 -; NODOT-NEXT: add a3, sp, a3 -; NODOT-NEXT: addi a3, a3, 16 -; NODOT-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vsext.vf2 v4, v8 -; NODOT-NEXT: csrr a3, vlenb -; NODOT-NEXT: slli a3, a3, 3 -; NODOT-NEXT: mv a4, a3 -; NODOT-NEXT: slli a3, a3, 1 -; NODOT-NEXT: add a3, a3, a4 -; NODOT-NEXT: add a3, sp, a3 -; NODOT-NEXT: addi a3, a3, 16 -; NODOT-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vzext.vf2 v0, v8 -; NODOT-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; NODOT-NEXT: vle8.v v8, (a1) -; NODOT-NEXT: addi a1, sp, 16 -; NODOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vwmaccsu.vv v24, v4, v0 -; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v4, v16, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v4 -; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v4, v8, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vzext.vf2 v16, v4 -; NODOT-NEXT: vwmulsu.vv v0, v12, v16 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 4 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vsext.vf2 v12, v16 -; NODOT-NEXT: vzext.vf2 v20, v8 -; NODOT-NEXT: vwmaccsu.vv v0, v12, v20 -; NODOT-NEXT: li a1, 64 +; NODOT-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v8, v16 +; NODOT-NEXT: vzext.vf4 v24, v0 +; NODOT-NEXT: vmul.vv v8, v24, v8 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 5 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 6 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vsext.vf4 v0, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a2, a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a2 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vzext.vf4 v24, v8 +; NODOT-NEXT: vmul.vv v8, v24, v0 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 4 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; NODOT-NEXT: li a0, 64 ; NODOT-NEXT: csrr a2, vlenb -; NODOT-NEXT: slli a2, a2, 5 +; NODOT-NEXT: slli a2, a2, 3 +; NODOT-NEXT: mv a3, a2 +; NODOT-NEXT: slli a2, a2, 1 +; NODOT-NEXT: add a2, a2, a3 +; NODOT-NEXT: add a2, sp, a2 +; NODOT-NEXT: addi a2, a2, 16 +; NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; NODOT-NEXT: vslidedown.vx v16, v8, a0 +; NODOT-NEXT: addi a2, sp, 16 +; 
NODOT-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v0, v16 +; NODOT-NEXT: csrr a2, vlenb +; NODOT-NEXT: slli a2, a2, 3 +; NODOT-NEXT: mv a3, a2 +; NODOT-NEXT: slli a2, a2, 2 +; NODOT-NEXT: add a2, a2, a3 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 ; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; NODOT-NEXT: vslidedown.vx v8, v16, a1 +; NODOT-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; NODOT-NEXT: vslidedown.vx v16, v16, a0 ; NODOT-NEXT: csrr a2, vlenb -; NODOT-NEXT: slli a2, a2, 5 +; NODOT-NEXT: slli a2, a2, 4 +; NODOT-NEXT: mv a3, a2 +; NODOT-NEXT: slli a2, a2, 1 +; NODOT-NEXT: add a2, a2, a3 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 -; NODOT-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; NODOT-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; NODOT-NEXT: csrr a2, vlenb -; NODOT-NEXT: slli a2, a2, 3 +; NODOT-NEXT: slli a2, a2, 4 ; NODOT-NEXT: mv a3, a2 ; NODOT-NEXT: slli a2, a2, 1 ; NODOT-NEXT: add a2, a2, a3 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 ; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vslidedown.vx v8, v16, a1 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vzext.vf4 v24, v16 +; NODOT-NEXT: vmul.vv v24, v24, v0 ; NODOT-NEXT: csrr a2, vlenb ; NODOT-NEXT: slli a2, a2, 3 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 -; NODOT-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; NODOT-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill +; NODOT-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; NODOT-NEXT: vslidedown.vx v8, v8, a1 ; NODOT-NEXT: csrr a2, vlenb -; NODOT-NEXT: slli a2, a2, 5 +; NODOT-NEXT: slli a2, a2, 3 +; NODOT-NEXT: mv a3, a2 +; NODOT-NEXT: slli a2, a2, 2 +; NODOT-NEXT: add a2, a2, a3 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 -; NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v16, v8 +; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vslidedown.vx v24, v16, a1 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v0, v8 +; NODOT-NEXT: vzext.vf4 v16, v24 +; NODOT-NEXT: vmul.vv v8, v16, v0 ; NODOT-NEXT: csrr a2, vlenb ; NODOT-NEXT: slli a2, a2, 3 +; NODOT-NEXT: mv a3, a2 +; NODOT-NEXT: slli a2, a2, 2 +; NODOT-NEXT: add a2, a2, a3 +; NODOT-NEXT: add a2, sp, a2 +; NODOT-NEXT: addi a2, a2, 16 +; NODOT-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; NODOT-NEXT: csrr a2, vlenb +; NODOT-NEXT: slli a2, a2, 6 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 ; NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vzext.vf2 v20, v8 -; NODOT-NEXT: vwmaccsu.vv v24, v16, v20 +; NODOT-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; NODOT-NEXT: vslidedown.vx v16, v8, a0 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v24, v16 ; NODOT-NEXT: csrr a2, vlenb -; NODOT-NEXT: slli a2, a2, 4 +; NODOT-NEXT: slli a2, a2, 3 +; NODOT-NEXT: mv a3, a2 +; NODOT-NEXT: slli a2, a2, 1 +; NODOT-NEXT: add a3, a3, a2 +; NODOT-NEXT: slli a2, a2, 1 +; NODOT-NEXT: add a2, a2, a3 ; NODOT-NEXT: add a2, sp, a2 ; NODOT-NEXT: addi a2, a2, 16 -; NODOT-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; NODOT-NEXT: vslidedown.vx v16, v16, a1 -; NODOT-NEXT: addi a2, sp, 16 ; 
NODOT-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vslidedown.vx v8, v8, a1 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 3 -; NODOT-NEXT: mv a2, a1 -; NODOT-NEXT: slli a1, a1, 1 -; NODOT-NEXT: add a1, a1, a2 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v8, v16 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 4 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 3 -; NODOT-NEXT: mv a2, a1 -; NODOT-NEXT: slli a1, a1, 1 -; NODOT-NEXT: add a1, a1, a2 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vzext.vf2 v20, v8 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 4 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; NODOT-NEXT: vwmaccsu.vv v0, v8, v20 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 5 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v20, v8, a0 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 3 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; NODOT-NEXT: vslidedown.vx v8, v8, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v12, v20 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 5 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vs4r.v v12, (a1) # vscale x 32-byte Folded Spill -; NODOT-NEXT: vzext.vf2 v12, v8 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 5 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; NODOT-NEXT: vwmaccsu.vv v24, v8, v12 -; NODOT-NEXT: vsetvli zero, a0, e8, m4, ta, ma -; NODOT-NEXT: vslidedown.vx v12, v16, a0 -; NODOT-NEXT: csrr a1, vlenb -; NODOT-NEXT: slli a1, a1, 3 -; NODOT-NEXT: mv a2, a1 -; NODOT-NEXT: slli a1, a1, 1 -; NODOT-NEXT: add a1, a1, a2 -; NODOT-NEXT: add a1, sp, a1 -; NODOT-NEXT: addi a1, a1, 16 -; NODOT-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload -; NODOT-NEXT: vslidedown.vx v8, v16, a0 -; NODOT-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; NODOT-NEXT: vsext.vf2 v16, v12 -; NODOT-NEXT: vzext.vf2 v12, v8 -; NODOT-NEXT: vwmaccsu.vv v0, v16, v12 -; NODOT-NEXT: vmv8r.v v8, v24 -; NODOT-NEXT: vmv8r.v v16, v0 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vzext.vf4 v0, v8 +; NODOT-NEXT: vmul.vv v24, v0, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a2 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 6 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; NODOT-NEXT: vslidedown.vx v12, v24, a1 +; NODOT-NEXT: csrr a0, vlenb +; 
NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a2, a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a2 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vslidedown.vx v20, v24, a1 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v24, v12 +; NODOT-NEXT: vzext.vf4 v0, v20 +; NODOT-NEXT: vmul.vv v24, v0, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a2, a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a2 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; NODOT-NEXT: addi a0, sp, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; NODOT-NEXT: vslidedown.vx v12, v24, a1 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 4 +; NODOT-NEXT: mv a2, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a2 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vslidedown.vx v20, v24, a1 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v24, v12 +; NODOT-NEXT: vzext.vf4 v0, v20 +; NODOT-NEXT: vmul.vv v24, v0, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 6 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; NODOT-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; NODOT-NEXT: vslidedown.vx v16, v16, a1 +; NODOT-NEXT: vslidedown.vx v4, v8, a1 +; NODOT-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; NODOT-NEXT: vsext.vf4 v8, v16 +; NODOT-NEXT: vzext.vf4 v16, v4 +; NODOT-NEXT: vmul.vv v8, v16, v8 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 4 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a1, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a1, a1, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a1 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vadd.vv v16, v16, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 5 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; NODOT-NEXT: csrr a0, vlenb ; NODOT-NEXT: slli a0, a0, 3 ; NODOT-NEXT: mv a1, a0 ; NODOT-NEXT: slli a0, a0, 2 ; NODOT-NEXT: add a0, a0, a1 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vadd.vv v0, v0, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a1, a0 +; NODOT-NEXT: slli a0, a0, 1 +; NODOT-NEXT: add a0, a0, a1 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vadd.vv v16, v16, v24 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vadd.vv v0, v0, v24 +; NODOT-NEXT: vadd.vv v8, v16, v8 +; 
NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 6 +; NODOT-NEXT: add a0, sp, a0 +; NODOT-NEXT: addi a0, a0, 16 +; NODOT-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; NODOT-NEXT: vadd.vv v16, v0, v16 +; NODOT-NEXT: csrr a0, vlenb +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: mv a1, a0 +; NODOT-NEXT: slli a0, a0, 3 +; NODOT-NEXT: add a0, a0, a1 ; NODOT-NEXT: add sp, sp, a0 ; NODOT-NEXT: .cfi_def_cfa sp, 16 ; NODOT-NEXT: addi sp, sp, 16 @@ -1428,14 +1532,13 @@ entry: define <1 x i32> @vqdotsu_vv_partial_v1i32_v2i8(<2 x i8> %a, <2 x i8> %b) { ; CHECK-LABEL: vqdotsu_vv_partial_v1i32_v2i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v8, v9 -; CHECK-NEXT: vwmulsu.vv v9, v10, v8 -; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-NEXT: vslidedown.vi v8, v9, 1 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vzext.vf4 v8, v9 +; CHECK-NEXT: vmul.vv v8, v8, v10 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret entry: %a.ext = sext <2 x i8> %a to <2 x i32> @@ -1451,11 +1554,10 @@ entry: define <1 x i32> @vqdotsu_vv_partial_v1i32_v8i8(<8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: vqdotsu_vv_partial_v1i32_v8i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vsext.vf2 v10, v8 -; CHECK-NEXT: vzext.vf2 v11, v9 -; CHECK-NEXT: vwmulsu.vv v8, v10, v11 -; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vsext.vf4 v10, v8 +; CHECK-NEXT: vzext.vf4 v12, v9 +; CHECK-NEXT: vmul.vv v8, v12, v10 ; CHECK-NEXT: vslidedown.vi v10, v8, 6 ; CHECK-NEXT: vslidedown.vi v12, v8, 5 ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index f9ac53b76ebaf..b2f1b7320ee45 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -113,9 +113,12 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-V-NEXT: li a0, -1 +; CHECK-V-NEXT: srli a0, a0, 32 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -304,10 +307,13 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-V-NEXT: li a0, -1 +; CHECK-V-NEXT: srli a0, a0, 32 ; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-V-NEXT: vmax.vx v10, v10, zero +; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -850,20 +856,23 @@ define <4 x i32> @ustest_f16i32(<4 x half> %x) { ; CHECK-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-V-NEXT: csrr a1, vlenb 
+; CHECK-V-NEXT: add a1, sp, a1 +; CHECK-V-NEXT: addi a1, a1, 16 +; CHECK-V-NEXT: vl2r.v v10, (a1) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-V-NEXT: li a0, -1 +; CHECK-V-NEXT: addi a1, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-V-NEXT: vslideup.vi v8, v10, 2 +; CHECK-V-NEXT: srli a0, a0, 32 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a1, a0, 1 ; CHECK-V-NEXT: add a0, a1, a0 @@ -1001,9 +1010,12 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8 -; CHECK-V-NEXT: vmax.vx v8, v9, zero +; CHECK-V-NEXT: lui a0, 16 +; CHECK-V-NEXT: addi a0, a0, -1 +; CHECK-V-NEXT: vmin.vx v8, v9, a0 +; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -1196,9 +1208,12 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-V-NEXT: lui a0, 16 +; CHECK-V-NEXT: addi a0, a0, -1 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -2247,27 +2262,30 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-V-NEXT: csrr a1, vlenb +; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: add a1, sp, a1 +; CHECK-V-NEXT: addi a1, a1, 16 +; CHECK-V-NEXT: vl2r.v v10, (a1) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-V-NEXT: lui a0, 16 +; CHECK-V-NEXT: addi a1, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-V-NEXT: csrr a1, vlenb +; CHECK-V-NEXT: add a1, sp, a1 +; CHECK-V-NEXT: addi a1, a1, 16 +; CHECK-V-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vslideup.vi v8, v9, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; 
CHECK-V-NEXT: vslideup.vi v8, v10, 4 +; CHECK-V-NEXT: addi a0, a0, -1 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 @@ -3677,9 +3695,12 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-V-NEXT: li a0, -1 +; CHECK-V-NEXT: srli a0, a0, 32 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -3863,10 +3884,13 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-V-NEXT: li a0, -1 +; CHECK-V-NEXT: srli a0, a0, 32 ; CHECK-V-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-V-NEXT: vmax.vx v10, v10, zero +; CHECK-V-NEXT: vmin.vx v8, v10, a0 +; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <4 x float> %x to <4 x i64> @@ -4404,20 +4428,23 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-V-NEXT: csrr a1, vlenb +; CHECK-V-NEXT: add a1, sp, a1 +; CHECK-V-NEXT: addi a1, a1, 16 +; CHECK-V-NEXT: vl2r.v v10, (a1) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-V-NEXT: li a0, -1 +; CHECK-V-NEXT: addi a1, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-V-NEXT: vslideup.vi v8, v10, 2 +; CHECK-V-NEXT: srli a0, a0, 32 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a1, a0, 1 ; CHECK-V-NEXT: add a0, a1, a0 @@ -4550,9 +4577,12 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vfncvt.rtz.x.f.w v9, v8 -; CHECK-V-NEXT: vmax.vx v8, v9, zero +; CHECK-V-NEXT: lui a0, 16 +; CHECK-V-NEXT: addi a0, a0, -1 +; CHECK-V-NEXT: vmin.vx v8, v9, a0 +; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf4, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -4740,9 +4770,12 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-V: # %bb.0: # %entry ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: 
vfcvt.rtz.x.f.v v8, v8 +; CHECK-V-NEXT: lui a0, 16 +; CHECK-V-NEXT: addi a0, a0, -1 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v8, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-V-NEXT: ret entry: %conv = fptosi <4 x float> %x to <4 x i32> @@ -5786,27 +5819,30 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-V-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill ; CHECK-V-NEXT: call __extendhfsf2 ; CHECK-V-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-V-NEXT: csrr a1, vlenb +; CHECK-V-NEXT: slli a1, a1, 1 +; CHECK-V-NEXT: add a1, sp, a1 +; CHECK-V-NEXT: addi a1, a1, 16 +; CHECK-V-NEXT: vl2r.v v10, (a1) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-V-NEXT: vmv.s.x v8, a0 -; CHECK-V-NEXT: addi a0, sp, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-V-NEXT: lui a0, 16 +; CHECK-V-NEXT: addi a1, sp, 16 +; CHECK-V-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload ; CHECK-V-NEXT: vslideup.vi v8, v9, 1 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl1r.v v9, (a0) # vscale x 8-byte Folded Reload +; CHECK-V-NEXT: csrr a1, vlenb +; CHECK-V-NEXT: add a1, sp, a1 +; CHECK-V-NEXT: addi a1, a1, 16 +; CHECK-V-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-V-NEXT: vslideup.vi v8, v9, 2 -; CHECK-V-NEXT: csrr a0, vlenb -; CHECK-V-NEXT: slli a0, a0, 1 -; CHECK-V-NEXT: add a0, sp, a0 -; CHECK-V-NEXT: addi a0, a0, 16 -; CHECK-V-NEXT: vl2r.v v10, (a0) # vscale x 16-byte Folded Reload ; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-V-NEXT: vslideup.vi v8, v10, 4 +; CHECK-V-NEXT: addi a0, a0, -1 +; CHECK-V-NEXT: vmin.vx v8, v8, a0 ; CHECK-V-NEXT: vmax.vx v10, v8, zero ; CHECK-V-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; CHECK-V-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-V-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-V-NEXT: csrr a0, vlenb ; CHECK-V-NEXT: slli a0, a0, 2 ; CHECK-V-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index 3633885bfa7d2..ba0d966d6ef99 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -2386,11 +2386,12 @@ define <4 x i32> @scalar_prefix(ptr %base, i32 signext %index, <4 x i32> %vecidx ; ; RV64-LABEL: scalar_prefix: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 4 ; RV64-NEXT: slli a1, a1, 10 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vwmulsu.vx v10, v8, a2 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsext.vf2 v10, v8 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsll.vi v10, v10, 2 +; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, i32 %index, <4 x i32> %vecidx @@ -2412,14 +2413,16 @@ define <4 x i32> @scalar_prefix_with_splat(ptr %base, i32 %index, <4 x i32> %vec ; ; RV64-LABEL: scalar_prefix_with_splat: ; RV64: # %bb.0: -; RV64-NEXT: li a2, 1024 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vsext.vf2 v10, v8 +; RV64-NEXT: vsext.vf2 v8, v12 +; RV64-NEXT: vsll.vi v8, v8, 10 +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 
2 +; RV64-NEXT: vadd.vv v10, v8, v10 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vmv.v.x v9, a2 -; RV64-NEXT: vwmaccsu.vx v10, a1, v9 -; RV64-NEXT: li a0, 4 -; RV64-NEXT: vwmaccus.vx v10, a0, v8 ; RV64-NEXT: vluxei64.v v8, (zero), v10 ; RV64-NEXT: ret %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %index, i32 0 @@ -2442,11 +2445,12 @@ define <4 x i32> @scalar_prefix_with_constant_splat(ptr %base, <4 x i32> %vecidx ; ; RV64-LABEL: scalar_prefix_with_constant_splat: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 4 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vwmulsu.vx v10, v8, a1 ; RV64-NEXT: lui a1, 5 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsext.vf2 v10, v8 ; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: vsll.vi v10, v10, 2 +; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, <4 x i32> splat (i32 20), <4 x i32> %vecidx @@ -2469,12 +2473,16 @@ define <4 x i32> @reassociate(ptr %base, i32 %index, <4 x i32> %vecidx) { ; RV64-LABEL: reassociate: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: li a0, 1024 +; RV64-NEXT: vsext.vf2 v10, v8 +; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vsll.vi v8, v10, 10 +; RV64-NEXT: vsext.vf2 v10, v12 +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 2 +; RV64-NEXT: vadd.vv v10, v8, v10 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vwmaccus.vx v10, a0, v8 -; RV64-NEXT: vmv.v.i v8, 4 -; RV64-NEXT: vwmaccsu.vx v10, a1, v8 ; RV64-NEXT: vluxei64.v v8, (zero), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, <4 x i32> %vecidx, i32 %index @@ -2496,13 +2504,16 @@ define <4 x i32> @reassociate_with_splat(ptr %base, i32 %index, <4 x i32> %vecid ; ; RV64-LABEL: reassociate_with_splat: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; RV64-NEXT: vmv.v.x v10, a0 -; RV64-NEXT: li a0, 1024 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vmv.v.x v12, a1 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vsext.vf2 v10, v8 +; RV64-NEXT: vsll.vi v8, v10, 10 +; RV64-NEXT: vsext.vf2 v10, v12 +; RV64-NEXT: vadd.vx v8, v8, a0 +; RV64-NEXT: vsll.vi v10, v10, 2 +; RV64-NEXT: vadd.vv v10, v8, v10 ; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; RV64-NEXT: vwmaccus.vx v10, a0, v8 -; RV64-NEXT: vmv.v.i v8, 4 -; RV64-NEXT: vwmaccsu.vx v10, a1, v8 ; RV64-NEXT: vluxei64.v v8, (zero), v10 ; RV64-NEXT: ret %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %index, i32 0 @@ -2524,10 +2535,11 @@ define <4 x i32> @reassociate_with_constant_splat(ptr %base, i32 %index, <4 x i3 ; ; RV64-LABEL: reassociate_with_constant_splat: ; RV64: # %bb.0: -; RV64-NEXT: li a1, 1024 -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vwmulsu.vx v10, v8, a1 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsext.vf2 v10, v8 +; RV64-NEXT: vsll.vi v10, v10, 10 ; RV64-NEXT: addi a0, a0, 80 +; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV64-NEXT: vluxei64.v v8, (a0), v10 ; RV64-NEXT: ret %gep = getelementptr [256 x i32], ptr %base, <4 x i32> %vecidx, <4 x i32> splat (i32 20) diff --git a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll index 03b090def5119..b6a74dea7b5a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/sshl_sat_vec.ll @@ -64,9 +64,9 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { define <16 x i8> @vec_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind { ; CHECK-LABEL: vec_v16i8: ; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; CHECK-NEXT: vmsle.vi v0, v8, -1 -; CHECK-NEXT: li a0, 127 ; CHECK-NEXT: vmv.v.x v10, a0 ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vmerge.vxm v10, v10, a0, v0 diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll index d2f73826e4e9e..9267e31f3950e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/trunc-select-to-max-usat.ll @@ -7,8 +7,10 @@ define <4 x i8> @test_v4i16_v4i8(<4 x i16> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i16> %x, zeroinitializer %b = sext <4 x i1> %a to <4 x i16> @@ -23,10 +25,12 @@ define <4 x i8> @test_v4i32_v4i8(<4 x i32> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i32> %x, zeroinitializer %b = sext <4 x i1> %a to <4 x i32> @@ -41,12 +45,14 @@ define <4 x i8> @test_v4i64_v4i8(<4 x i64> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, 255 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i64> %x, zeroinitializer %b = sext <4 x i1> %a to <4 x i64> @@ -61,8 +67,11 @@ define <4 x i16> @test_v4i32_v4i16(<4 x i32> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v8, 0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i32> %x, zeroinitializer %b = sext <4 x i1> %a to <4 x i32> @@ -77,10 +86,13 @@ define <4 x i16> @test_v4i64_v4i16(<4 x i64> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v10, v8, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i64> %x, zeroinitializer %b = sext <4 x i1> %a to <4 x 
i64> @@ -94,9 +106,12 @@ define <4 x i32> @test_v4i64_v4i32(<4 x i64> %x) { ; CHECK-LABEL: test_v4i64_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vmin.vx v10, v8, a0 ; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma -; CHECK-NEXT: vnclipu.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 ; CHECK-NEXT: ret %a = icmp sgt <4 x i64> %x, zeroinitializer %b = sext <4 x i1> %a to <4 x i64> diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 4753ab915bdf3..403577b1a5f8b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -142,18 +142,18 @@ define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec) ; V-LABEL: vector_deinterleave_v8i64_v16i64: ; V: # %bb.0: ; V-NEXT: li a0, 85 -; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; V-NEXT: vmv.v.i v0, -16 -; V-NEXT: vid.v v16 ; V-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; V-NEXT: vslidedown.vi v24, v8, 8 -; V-NEXT: vmv.s.x v12, a0 -; V-NEXT: li a0, 170 ; V-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; V-NEXT: vadd.vv v20, v16, v16 +; V-NEXT: vid.v v12 +; V-NEXT: vmv.s.x v13, a0 +; V-NEXT: li a0, 170 +; V-NEXT: vadd.vv v20, v12, v12 ; V-NEXT: vmv.s.x v21, a0 ; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; V-NEXT: vcompress.vm v16, v8, v12 +; V-NEXT: vcompress.vm v16, v8, v13 ; V-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; V-NEXT: vadd.vi v22, v20, -8 ; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 6144f916ea52b..338c334be44a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -1063,35 +1063,34 @@ define {, , , @vdot_lane_s32(<2 x i32> noundef %var_1, <8 x i8> noundef %var_ ; CHECK-NEXT: vnsrl.wi v8, v11, 0 ; CHECK-NEXT: vnsrl.wi v9, v11, 16 ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vwadd.vv v10, v8, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v10, v8 +; CHECK-NEXT: vsext.vf2 v8, v9 +; CHECK-NEXT: vadd.vv v8, v10, v8 ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v10, 0 -; CHECK-NEXT: vnsrl.wx v9, v10, a0 -; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: vnsrl.wi v9, v8, 0 +; CHECK-NEXT: vnsrl.wx v8, v8, a0 +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret entry: %a = shufflevector <8 x i16> %x, <8 x i16> poison, <4 x i32> diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 4dd9173e2d418..38e42c137e3a9 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -663,8 +663,8 @@ define void @vld3_v2i8(ptr %src, ptr %dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .pad #8 ; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: ldrd r0, r2, [r0] -; CHECK-NEXT: strd r0, r2, [sp] +; CHECK-NEXT: ldrd r2, r0, [r0] +; CHECK-NEXT: strd r2, r0, [sp] ; CHECK-NEXT: mov r0, sp ; CHECK-NEXT: vldrb.u16 q0, [r0] ; CHECK-NEXT: vmov.u16 r0, q0[4] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index dec829fed3535..106a3633809f1 100644 --- 
a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -967,9 +967,9 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; ; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0 ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7] -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb (%rdi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) @@ -979,8 +979,8 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512F-NEXT: vpaddb (%rdi), %ymm1, %ymm1 ; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -990,8 +990,8 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in. ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-NEXT: vpaddb (%rdi), %ymm1, %ymm1 ; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx) @@ -1634,8 +1634,8 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in. ; ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1752,8 +1752,8 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -1867,8 +1867,8 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -2025,9 +2025,9 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2177,9 +2177,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -2329,9 +2329,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i ; ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3510,10 +3510,12 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), 
%zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -3638,10 +3640,12 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -3760,11 +3764,13 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -3890,10 +3896,12 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 +; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-NEXT: vzeroupper @@ -4013,8 +4021,8 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm1 @@ -4126,8 +4134,8 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -4380,10 +4388,12 @@ define void 
@vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,14,0,0,0,0,0,0,0,1,1,0,0,0,0] -; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] +; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -4393,9 +4403,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0] -; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0 -; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,14,0,0,0,0,0,0,16,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm0, %zmm1 +; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper ; AVX512BW-FAST-NEXT: retq @@ -4447,21 +4457,21 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rsi), %xmm2 +; AVX-NEXT: vpaddb (%rdi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4526,9 +4536,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermd 
%zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -4633,8 +4643,8 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0 +; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15] ; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 @@ -4701,21 +4711,21 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rsi), %xmm2 +; AVX-NEXT: vpaddb (%rdi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -4780,9 +4790,9 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7] -; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 23b46ee59154f..79aa36b1a0a41 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -1943,349 +1943,405 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL-LABEL: test17: ; KNL: ## %bb.0: ; KNL-NEXT: movq %rdi, %rax +; KNL-NEXT: movw $-3, %di +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: 
korw %k1, %k0, %k0 -; KNL-NEXT: movw $-5, %di -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kandw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k2, %k1 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 ; KNL-NEXT: korw %k2, %k0, %k0 -; KNL-NEXT: movw $-9, %di +; KNL-NEXT: movw $-5, %di ; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: korw %k3, %k0, %k0 -; KNL-NEXT: movw $-17, %di -; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: kshiftrw $13, %k4, %k4 ; KNL-NEXT: korw %k4, %k0, %k0 -; KNL-NEXT: movw $-33, %di +; KNL-NEXT: movw $-9, %di ; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kandw %k4, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: kshiftrw $12, %k5, %k5 ; KNL-NEXT: korw %k5, %k0, %k0 -; KNL-NEXT: movw $-65, %di +; KNL-NEXT: movw $-17, %di ; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, %k6, %k6 +; KNL-NEXT: kshiftrw $11, %k6, %k6 ; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movw $-33, %di +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kandw %k3, %k0, %k0 +; KNL-NEXT: kmovw %k3, %k6 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movw $-65, %di +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kandw %k3, %k0, %k0 +; KNL-NEXT: kmovw %k3, %k7 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi ; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $14, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k2, %k0, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload ; 
KNL-NEXT: kandw %k3, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k6, %k0, %k0 +; KNL-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k7, %k0, %k0 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: kmovw %r10d, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: korw %k0, %k6, %k0 +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, 
%k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k6, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k7, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: kmovw %r10d, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: korw %k0, %k6, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload +; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: kandw %k7, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; KNL-NEXT: kandw %k5, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: kmovw %r10d, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; 
KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: korw %k0, %k6, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kmovw %k6, %k2 +; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k2, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; KNL-NEXT: kandw %k4, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k3, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload +; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k7, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 +; KNL-NEXT: kmovw %k5, %k7 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: kmovw %r10d, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: korw %k0, %k6, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: 
kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k5, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k6, %k0, %k0 +; KNL-NEXT: kmovw %k6, %k5 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw %k1, %k6 +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k7, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; KNL-NEXT: andl $1, %edi -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d -; KNL-NEXT: kmovw %r10d, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: korw %k0, %k6, %k0 +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k2, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k3, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 -; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; 
KNL-NEXT: kandw %k7, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edi -; KNL-NEXT: kmovw %edi, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: andl $1, %esi -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kmovw %esi, %k7 -; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: kmovw %edx, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k1, %k0, %k0 -; KNL-NEXT: kmovw %ecx, %k7 -; KNL-NEXT: kshiftlw $15, %k7, %k7 -; KNL-NEXT: kshiftrw $13, %k7, %k7 -; KNL-NEXT: korw %k7, %k0, %k0 -; KNL-NEXT: kandw %k2, %k0, %k0 -; KNL-NEXT: kmovw %r8d, %k7 -; KNL-NEXT: kshiftlw $15, %k7, %k7 -; KNL-NEXT: kshiftrw $12, %k7, %k7 -; KNL-NEXT: korw %k7, %k0, %k0 -; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: kmovw %r9d, %k7 -; KNL-NEXT: kshiftlw $15, %k7, %k7 -; KNL-NEXT: kshiftrw $11, %k7, %k7 -; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload ; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k6, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: kandw %k7, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k7 -; KNL-NEXT: kshiftlw $15, %k7, %k7 -; KNL-NEXT: kshiftrw $10, %k7, %k7 -; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k7 -; KNL-NEXT: kshiftlw $15, %k7, %k7 -; KNL-NEXT: kshiftrw $9, %k7, %k7 -; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $9, %k3, %k3 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; KNL-NEXT: andl $1, %ecx -; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %edx -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kshiftlw $15, %k7, %k7 -; KNL-NEXT: kshiftrw $14, %k7, %k7 -; KNL-NEXT: kmovw %ecx, %k6 -; KNL-NEXT: korw %k7, %k6, %k6 -; KNL-NEXT: kandw %k1, %k6, %k1 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: korw %k3, %k2, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: kandw %k2, %k1, %k1 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx -; KNL-NEXT: kmovw %ecx, %k6 -; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $13, %k6, %k6 -; KNL-NEXT: korw 
%k6, %k1, %k1 +; KNL-NEXT: kmovw %ecx, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k3, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload ; KNL-NEXT: kandw %k2, %k1, %k1 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kandw %k4, %k1, %k1 +; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 ; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kandw %k3, %k1, %k1 +; KNL-NEXT: kandw %k6, %k1, %k1 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 ; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kandw %k4, %k1, %k1 +; KNL-NEXT: kandw %k7, %k1, %k1 ; KNL-NEXT: movzbl {{[0-9]+}}(%rsp), %ecx ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 @@ -2718,361 +2774,409 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-LABEL: test17: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: pushl %ebx -; KNL_X32-NEXT: subl $16, %esp +; KNL_X32-NEXT: subl $28, %esp +; KNL_X32-NEXT: movw $-3, %ax +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andl $1, %eax ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kshiftlw $15, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: korw %k1, %k0, %k0 -; KNL_X32-NEXT: movw $-5, %ax -; KNL_X32-NEXT: kmovw %eax, %k1 -; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k1 +; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 +; KNL_X32-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: kandw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k2, %k1 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k0, %k0 -; KNL_X32-NEXT: movw $-9, %ax +; KNL_X32-NEXT: movw $-5, %ax ; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kmovw %k2, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 -; KNL_X32-NEXT: korw %k3, %k0, %k0 -; KNL_X32-NEXT: movw $-17, %ax -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kandw %k3, %k0, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $13, %k4, %k4 ; KNL_X32-NEXT: korw %k4, %k0, %k0 -; KNL_X32-NEXT: movw $-33, %ax +; KNL_X32-NEXT: movw $-9, %ax ; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 ; KNL_X32-NEXT: korw %k5, %k0, %k0 -; KNL_X32-NEXT: movw $-65, %ax +; KNL_X32-NEXT: movw $-17, %ax ; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 
-; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 ; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: movw $-33, %ax +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $10, %k7, %k7 +; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: movw $-65, %ax +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andl $1, %eax ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $14, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kmovw %k1, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k2, %k0, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k3 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k3, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kmovw %k0, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andl $1, %eax -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k2 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k3, %k0, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 +; KNL_X32-NEXT: kmovw %k4, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k5, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k6, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: andl $1, %eax -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: korw %k0, %k6, %k0 -; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k7, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: kandw %k7, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, 
{{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: andl $1, %eax +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k3, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k2, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andl $1, %eax -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: korw %k0, %k6, %k0 -; KNL_X32-NEXT: kandw %k1, %k0, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k3, %k0, %k0 -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 
+; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k4 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k5 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k6 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: andl $1, %eax -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k3, %k0, %k0 +; KNL_X32-NEXT: andl $1, %eax +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: 
kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: andl $1, %eax -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %k1, %k2 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kandw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: andl $1, %eax +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k3 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k3, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: andl $1, %eax -; 
KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: korw %k0, %k7, %k0 -; KNL_X32-NEXT: kandw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 -; KNL_X32-NEXT: kshiftrw $13, %k7, %k7 -; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 -; KNL_X32-NEXT: kshiftrw $12, %k7, %k7 -; KNL_X32-NEXT: korw %k7, %k0, %k0 -; KNL_X32-NEXT: kandw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 -; KNL_X32-NEXT: kshiftrw $11, %k7, %k7 -; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: andl $1, %eax +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k2, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 -; KNL_X32-NEXT: kshiftrw $10, %k7, %k7 -; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 -; KNL_X32-NEXT: kshiftrw $9, %k7, %k7 -; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k6, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kandw %k7, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k4 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k4, %k0, %k0 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andl $1, %eax -; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; KNL_X32-NEXT: kmovw %ecx, 
%k7 -; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 -; KNL_X32-NEXT: kshiftrw $14, %k7, %k7 -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: korw %k7, %k6, %k6 -; KNL_X32-NEXT: kandw %k1, %k6, %k1 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: korw %k3, %k1, %k1 +; KNL_X32-NEXT: kandw %k2, %k1, %k1 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 -; KNL_X32-NEXT: korw %k6, %k1, %k1 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k3, %k1, %k1 +; KNL_X32-NEXT: kmovw (%esp), %k2 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k2, %k1, %k1 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k1, %k1 +; KNL_X32-NEXT: kandw %k5, %k1, %k1 +; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 -; KNL_X32-NEXT: kandw %k3, %k1, %k1 +; KNL_X32-NEXT: kandw %k6, %k1, %k1 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 -; KNL_X32-NEXT: kandw %k4, %k1, %k1 +; KNL_X32-NEXT: kandw %k7, %k1, %k1 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 -; KNL_X32-NEXT: kandw %k5, %k1, %k1 +; KNL_X32-NEXT: kandw %k4, %k1, %k1 ; KNL_X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 @@ -3127,7 +3231,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andb $127, %cl ; KNL_X32-NEXT: movb %cl, (%eax) -; KNL_X32-NEXT: addl $16, %esp +; KNL_X32-NEXT: addl $28, %esp ; KNL_X32-NEXT: popl %ebx ; KNL_X32-NEXT: retl $4 ; diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 76c87900b04d2..dce2d301a60cb 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -531,14 +531,16 @@ define <8 x i8> @f64to8uc(<8 x double> %f) { ; NOVL-LABEL: f64to8uc: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdb %zmm0, %xmm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to8uc: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdb %ymm0, %xmm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %res = fptoui <8 x double> %f to <8 x i8> @@ -674,14 +676,16 @@ define <8 x i8> @f64to8sc(<8 x double> %f) { ; NOVL-LABEL: f64to8sc: ; NOVL: # %bb.0: ; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdb %zmm0, %xmm0 +; NOVL-NEXT: vpmovdw %zmm0, %ymm0 +; NOVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; NOVL-NEXT: vzeroupper ; NOVL-NEXT: retq ; ; VL-LABEL: f64to8sc: ; VL: # %bb.0: ; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdb %ymm0, %xmm0 +; VL-NEXT: vpmovdw %ymm0, %xmm0 +; VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; VL-NEXT: vzeroupper ; VL-NEXT: retq %res = fptosi <8 x double> %f to <8 x i8> diff --git 
a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8aa898f3ec576..ff70826ae8060 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1381,10 +1381,13 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: kshiftrw $9, %k1, %k1 +; KNL-NEXT: kshiftrw $8, %k0, %k3 +; KNL-NEXT: kunpckbw %k0, %k3, %k0 ; KNL-NEXT: movw $-65, %ax ; KNL-NEXT: kmovw %eax, %k3 ; KNL-NEXT: kandw %k3, %k0, %k0 -; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: kshiftrw $9, %k1, %k1 ; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $9, %k0, %k0 ; KNL-NEXT: kshiftrw $9, %k0, %k0 @@ -1420,10 +1423,13 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $8, %k0, %k3 +; AVX512BW-NEXT: kunpckbw %k0, %k3, %k0 ; AVX512BW-NEXT: movw $-65, %ax ; AVX512BW-NEXT: kmovd %eax, %k3 ; AVX512BW-NEXT: kandw %k3, %k0, %k0 -; AVX512BW-NEXT: kshiftlw $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 +; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 ; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll index b3bf464b529d0..23bb89f08c415 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1809,9 +1809,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -1823,8 +1824,9 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -1837,9 +1839,10 @@ define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> @@ -1851,8 +1854,9 @@ define <2 x i64> 
@test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64 define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_4xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2294,9 +2298,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,3] ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,3] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2307,8 +2312,9 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0: ; CHECK: # %bb.0: +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,0,2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,3] +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -2323,9 +2329,10 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64 ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> @@ -2337,8 +2344,9 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3] +; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3780,10 +3788,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %v ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 
x double> %vec, <4 x double> undef, <2 x i32> @@ -3795,9 +3804,10 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %v define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,3] +; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3810,10 +3820,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %v ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3] ; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[1,3,2,3] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 +; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> @@ -3825,9 +3836,10 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %v define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mask1: ; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3] ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,2,3] +; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512fp16-novl.ll b/llvm/test/CodeGen/X86/avx512fp16-novl.ll index d17cacc0e1ad7..5130cda6af594 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-novl.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-novl.ll @@ -133,6 +133,14 @@ define <8 x half> @select(<8 x half> %x) { ; CHECK-NEXT: seta %al ; CHECK-NEXT: andl $1, %eax ; CHECK-NEXT: kmovw %eax, %k0 +; CHECK-NEXT: kshiftrw $1, %k0, %k1 +; CHECK-NEXT: kshiftlw $1, %k1, %k1 +; CHECK-NEXT: korw %k0, %k1, %k0 +; CHECK-NEXT: kshiftrw $8, %k0, %k1 +; CHECK-NEXT: kunpckbw %k0, %k1, %k0 +; CHECK-NEXT: movw $-3, %ax +; CHECK-NEXT: kmovd %eax, %k1 +; CHECK-NEXT: kandw %k1, %k0, %k0 ; CHECK-NEXT: vpsrld $16, %xmm0, %xmm2 ; CHECK-NEXT: vucomish %xmm1, %xmm2 ; CHECK-NEXT: seta %al @@ -187,7 +195,8 @@ define <8 x half> @select(<8 x half> %x) { ; CHECK-NEXT: vucomish %xmm1, %xmm2 ; CHECK-NEXT: seta %al ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: kshiftlw $6, %k1, %k1 +; CHECK-NEXT: kshiftlw $15, %k1, %k1 +; CHECK-NEXT: kshiftrw $9, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kshiftlw $9, %k0, %k0 ; CHECK-NEXT: kshiftrw $9, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index 4b0e5441b4abf..dd0572b119bd9 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -235,19 +235,34 @@ define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) { ; SSE2-NEXT: movd %esi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4i32_partial: ; SSE41: # %bb.0: -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pinsrd $3, %esi, %xmm0 +; SSE41-NEXT: movd %edi, %xmm1 +; SSE41-NEXT: pinsrd $3, %esi, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] ; SSE41-NEXT: retq ; -; AVX-LABEL: test_buildvector_v4i32_partial: -; AVX: # %bb.0: -; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_buildvector_v4i32_partial: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %edi, %xmm0 +; AVX1-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_buildvector_v4i32_partial: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX2-NEXT: retq %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0 %ins1 = insertelement <4 x i32> %ins0, i32 undef, i32 1 %ins2 = insertelement <4 x i32> %ins1, i32 undef, i32 2 @@ -360,22 +375,34 @@ define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 } define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) { -; SSE-LABEL: test_buildvector_v8i16_partial: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $1, %edi, %xmm0 -; SSE-NEXT: pinsrw $3, %esi, %xmm0 -; SSE-NEXT: pinsrw $4, %edx, %xmm0 -; SSE-NEXT: pinsrw $5, %ecx, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_buildvector_v8i16_partial: +; SSE2: # %bb.0: +; SSE2-NEXT: pinsrw $1, %edi, %xmm0 +; SSE2-NEXT: pinsrw $3, %esi, %xmm0 +; SSE2-NEXT: pinsrw $4, %edx, %xmm0 +; SSE2-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_buildvector_v8i16_partial: +; SSE41: # %bb.0: +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pinsrw $1, %edi, %xmm0 +; SSE41-NEXT: pinsrw $3, %esi, %xmm0 +; SSE41-NEXT: pinsrw $4, %edx, %xmm0 +; SSE41-NEXT: pinsrw $5, %ecx, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3,4,5],xmm1[6,7] +; SSE41-NEXT: retq ; ; AVX-LABEL: test_buildvector_v8i16_partial: ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $3, %esi, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $4, %edx, %xmm0, %xmm0 -; AVX-NEXT: vpinsrw $5, %ecx, %xmm0, %xmm0 +; AVX-NEXT: vpinsrw $1, %edi, %xmm0, %xmm1 +; AVX-NEXT: vpinsrw $3, %esi, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $4, %edx, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5],xmm0[6,7] ; AVX-NEXT: retq %ins0 = insertelement <8 x i16> undef, i16 undef, i32 0 %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1 @@ -552,28 +579,29 @@ define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11 ; SSE2-NEXT: pinsrw $6, %r8d, %xmm0 ; SSE2-NEXT: shll $8, %r9d ; SSE2-NEXT: pinsrw $7, %r9d, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v16i8_partial: ; SSE41: # %bb.0: -; 
SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrb $2, %edi, %xmm0 ; SSE41-NEXT: pinsrb $6, %esi, %xmm0 ; SSE41-NEXT: pinsrb $8, %edx, %xmm0 ; SSE41-NEXT: pinsrb $11, %ecx, %xmm0 ; SSE41-NEXT: pinsrb $12, %r8d, %xmm0 ; SSE41-NEXT: pinsrb $15, %r9d, %xmm0 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: test_buildvector_v16i8_partial: ; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 ; AVX-NEXT: vpinsrb $6, %esi, %xmm0, %xmm0 ; AVX-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0 ; AVX-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; AVX-NEXT: vpinsrb $12, %r8d, %xmm0, %xmm0 ; AVX-NEXT: vpinsrb $15, %r9d, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %ins0 = insertelement <16 x i8> undef, i8 undef, i32 0 %ins1 = insertelement <16 x i8> %ins0, i8 undef, i32 1 diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll index 4f1c00b64fa98..2e04cb3ee8953 100644 --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -520,7 +520,10 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6 ; XOP-NEXT: vmovq %rdi, %xmm2 ; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 -; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 +; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6,7] +; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0 +; XOP-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX1-LABEL: bitselect_v4i64_broadcast_rrr: @@ -528,8 +531,9 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6 ; AVX1-NEXT: vmovq %rdi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -583,16 +587,28 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrm(<4 x i64> %a0, <4 x i64> %a1, pt ; XOP-LABEL: bitselect_v4i64_broadcast_rrm: ; XOP: # %bb.0: ; XOP-NEXT: vbroadcastsd (%rdi), %ymm2 -; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 +; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6,7] +; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0 +; XOP-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v4i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm2 -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1 +; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v4i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm2 +; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i64_broadcast_rrm: ; AVX512F: # %bb.0: @@ -889,8 +905,13 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6 ; XOP-NEXT: 
vmovq %rdi, %xmm4 ; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0 -; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1 +; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6,7] +; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1 +; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0 +; XOP-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1 +; XOP-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX1-LABEL: bitselect_v8i64_broadcast_rrr: @@ -898,11 +919,12 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6 ; AVX1-NEXT: vmovq %rdi, %xmm4 ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3 ; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -958,20 +980,37 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrm(<8 x i64> %a0, <8 x i64> %a1, pt ; XOP-LABEL: bitselect_v8i64_broadcast_rrm: ; XOP: # %bb.0: ; XOP-NEXT: vbroadcastsd (%rdi), %ymm4 -; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0 -; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1 +; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6,7] +; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1 +; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0 +; XOP-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1 +; XOP-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0 ; XOP-NEXT: retq ; -; AVX-LABEL: bitselect_v8i64_broadcast_rrm: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd (%rdi), %ymm4 -; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6,7] +; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: bitselect_v8i64_broadcast_rrm: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd (%rdi), %ymm4 +; AVX2-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vandnps %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vandnps %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: retq ; ; AVX512-LABEL: bitselect_v8i64_broadcast_rrm: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll index b8746626d6072..0253a683aff7f 100644 --- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll @@ -29,12 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16() { define <4 x i16> @test_sext_4i8_4i16_undef() { ; X86-LABEL: test_sext_4i8_4i16_undef: ; X86: # %bb.0: -; X86-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0] +; 
X86-NEXT: vpmovsxbw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0] +; X64-NEXT: vpmovsxbw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -65,12 +65,12 @@ define <4 x i32> @test_sext_4i8_4i32() { define <4 x i32> @test_sext_4i8_4i32_undef() { ; X86-LABEL: test_sext_4i8_4i32_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,0,4294967293] +; X86-NEXT: vpmovsxbd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,0,4294967293] +; X64-NEXT: vpmovsxbd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -101,12 +101,16 @@ define <4 x i64> @test_sext_4i8_4i64() { define <4 x i64> @test_sext_4i8_4i64_undef() { ; X86-LABEL: test_sext_4i8_4i64_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,4294967295,4294967295,0,0,4294967293,4294967295] +; X86-NEXT: vpmovsxbq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-NEXT: vpmovsxbq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i64_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,18446744073709551615,0,18446744073709551613] +; X64-NEXT: vpmovsxbq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-NEXT: vpmovsxbq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -163,12 +167,12 @@ define <8 x i32> @test_sext_8i8_8i32() { define <8 x i16> @test_sext_8i8_8i16_undef() { ; X86-LABEL: test_sext_8i8_8i16_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [0,65535,0,65533,0,65531,0,65529] +; X86-NEXT: vpmovsxbw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: test_sext_8i8_8i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,65535,0,65533,0,65531,0,65529] +; X64-NEXT: vpmovsxbw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 undef, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -185,12 +189,16 @@ define <8 x i16> @test_sext_8i8_8i16_undef() { define <8 x i32> @test_sext_8i8_8i32_undef() { ; X86-LABEL: test_sext_8i8_8i32_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,2,0,4,0,6,0] +; X86-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,0,4,0,6,0,0,0,0,0,0,0,0,0] +; X86-NEXT: vpmovsxbd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_sext_8i8_8i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,2,0,4,0,6,0] +; X64-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,0,4,0,6,0,0,0,0,0,0,0,0,0] +; X64-NEXT: vpmovsxbd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 undef, i32 1 @@ -261,12 +269,12 @@ define <4 x i64> @test_zext_4i8_4i64() { define <4 x i16> @test_zext_4i8_4i16_undef() { ; X86-LABEL: test_zext_4i8_4i16_undef: ; X86: # %bb.0: -; X86-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0] +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X86-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0] +; X64-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -279,12 +287,12 @@ define <4 x i16> @test_zext_4i8_4i16_undef() { define <4 x i32> @test_zext_4i8_4i32_undef() { ; X86-LABEL: test_zext_4i8_4i32_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,2,0] +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = [0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0] ; X86-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,0,2,0] +; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = [0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 undef, i32 1 @@ -297,12 +305,16 @@ define <4 x i32> @test_zext_4i8_4i32_undef() { define <4 x i64> @test_zext_4i8_4i64_undef() { ; X86-LABEL: test_zext_4i8_4i64_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,255,0,2,0,0,0] +; X86-NEXT: vpmovzxbq {{.*#+}} xmm0 = [0,255,2,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i64_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,255,2,0] +; X64-NEXT: vpmovzxbq {{.*#+}} xmm0 = [0,255,2,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -359,12 +371,12 @@ define <8 x i32> @test_zext_8i8_8i32() { define <8 x i16> @test_zext_8i8_8i16_undef() { ; X86-LABEL: test_zext_8i8_8i16_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253,0,251,0,249] +; X86-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X86-NEXT: retl ; ; X64-LABEL: test_zext_8i8_8i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,0,253,0,251,0,249] +; X64-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 undef, i32 0 %2 = insertelement <8 x i8> %1, i8 -1, i32 1 @@ -381,12 +393,16 @@ define <8 x i16> @test_zext_8i8_8i16_undef() { define <8 x i32> @test_zext_8i8_8i32_undef() { ; X86-LABEL: test_zext_8i8_8i32_undef: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,2,253,4,0,6,0] +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = [0,0,2,253,4,0,6,0,0,0,0,0,0,0,0,0] +; X86-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_zext_8i8_8i32_undef: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,2,253,4,0,6,0] +; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = [0,0,2,253,4,0,6,0,0,0,0,0,0,0,0,0] +; X64-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <8 x i8> undef, i8 0, i32 0 %2 = insertelement <8 x i8> %1, i8 undef, i32 1 diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index 1a2cfd69650b8..b6a95fd199b42 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -1583,12 +1583,20 @@ define <2 x i8> @stest_f64i8(<2 x double> %x) nounwind { ; SSE-NEXT: packsswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: stest_f64i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: stest_f64i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: stest_f64i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i32> %0 = icmp slt <2 x i32> %conv, @@ -1609,13 +1617,13 @@ define <2 x i8> @utest_f64i8(<2 x double> %x) nounwind { ; SSE-NEXT: cvttpd2dq %xmm0, %xmm3 ; SSE-NEXT: andpd %xmm2, %xmm3 ; SSE-NEXT: orpd %xmm1, %xmm3 -; SSE-NEXT: movapd {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: xorpd %xmm3, %xmm0 -; SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: psrld $24, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movapd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: xorpd %xmm3, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903,2147483648,2147483648] +; SSE-NEXT: pcmpgtd %xmm1, %xmm0 +; SSE-NEXT: andpd %xmm0, %xmm3 +; SSE-NEXT: andnpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: orpd %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq @@ -1628,8 +1636,7 @@ define <2 x i8> @utest_f64i8(<2 x double> %x) nounwind { ; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 ; AVX2-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vorpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX2-NEXT: retq ; @@ -1655,12 +1662,20 @@ define <2 x i8> @ustest_f64i8(<2 x double> %x) nounwind { ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ustest_f64i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: ustest_f64i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: ustest_f64i8: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv 
= fptosi <2 x double> %x to <2 x i32> %0 = icmp slt <2 x i32> %conv, diff --git a/llvm/test/CodeGen/X86/fptoui-may-overflow.ll b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll index ec53704289d19..e9aef01696759 100644 --- a/llvm/test/CodeGen/X86/fptoui-may-overflow.ll +++ b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll @@ -9,7 +9,9 @@ define <16 x i8> @fptoui_zext(<4 x float> %arg) { ; CHECK-LABEL: fptoui_zext: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: retq %f = fptoui <4 x float> %arg to <4 x i8> %z = zext <4 x i8> %f to <4 x i32> @@ -25,7 +27,9 @@ define <16 x i8> @fptoui_shuffle(<4 x float> %arg) { ; CHECK-LABEL: fptoui_shuffle: ; CHECK: # %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: retq %f = fptoui <4 x float> %arg to <4 x i8> %s = shufflevector <4 x i8> %f, <4 x i8> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll index e81a983c07018..0ffabedda77d1 100644 --- a/llvm/test/CodeGen/X86/masked_expandload.ll +++ b/llvm/test/CodeGen/X86/masked_expandload.ll @@ -1163,24 +1163,32 @@ define <2 x float> @expandload_v2f32_v2i1(ptr %base, <2 x float> %src0, <2 x i32 define <4 x float> @expandload_v4f32_const(ptr %base, <4 x float> %src0) { ; SSE2-LABEL: expandload_v4f32_const: ; SSE2: ## %bb.0: -; SSE2-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; SSE2-NEXT: movss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm2[0,3] +; SSE2-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 4(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 8(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[0,3] ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE42-LABEL: expandload_v4f32_const: ; SSE42: ## %bb.0: -; SSE42-NEXT: movsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; SSE42-NEXT: insertps $32, 8(%rdi), %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; SSE42-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 4(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 8(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: expandload_v4f32_const: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovsd (%rdi), %xmm1 ## xmm1 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 8(%rdi), %xmm1, %xmm1 ## xmm1 = xmm1[0,1],mem[0],xmm1[3] +; AVX1OR2-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 4(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 8(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; 
AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX1OR2-NEXT: retq ; @@ -1214,40 +1222,94 @@ define <4 x float> @expandload_v4f32_const(ptr %base, <4 x float> %src0) { define <16 x float> @expandload_v16f32_const(ptr %base, <16 x float> %src0) { ; SSE2-LABEL: expandload_v16f32_const: ; SSE2: ## %bb.0: -; SSE2-NEXT: movups (%rdi), %xmm0 -; SSE2-NEXT: movups 16(%rdi), %xmm1 -; SSE2-NEXT: movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero -; SSE2-NEXT: movss 52(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero -; SSE2-NEXT: movss 40(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm6[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE2-NEXT: movaps %xmm5, %xmm2 -; SSE2-NEXT: movaps %xmm4, %xmm3 +; SSE2-NEXT: movss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 4(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 8(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 12(%rdi), %xmm8 ## xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 16(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 20(%rdi), %xmm9 ## xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 24(%rdi), %xmm10 ## xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 28(%rdi), %xmm11 ## xmm11 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 32(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 36(%rdi), %xmm12 ## xmm12 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 40(%rdi), %xmm13 ## xmm13 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 44(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 48(%rdi), %xmm14 ## xmm14 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 52(%rdi), %xmm15 ## xmm15 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm13[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm15[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,0] +; SSE2-NEXT: movaps %xmm4, %xmm2 +; SSE2-NEXT: movaps %xmm5, %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: expandload_v16f32_const: ; SSE42: ## %bb.0: -; SSE42-NEXT: movsd 44(%rdi), %xmm4 ## xmm4 = mem[0],zero -; SSE42-NEXT: insertps $32, 52(%rdi), %xmm4 ## xmm4 = xmm4[0,1],mem[0],xmm4[3] -; SSE42-NEXT: movsd 32(%rdi), %xmm5 ## xmm5 = mem[0],zero -; SSE42-NEXT: insertps $32, 40(%rdi), %xmm5 ## xmm5 = xmm5[0,1],mem[0],xmm5[3] -; SSE42-NEXT: movups (%rdi), %xmm0 -; SSE42-NEXT: movups 16(%rdi), %xmm1 -; SSE42-NEXT: blendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; SSE42-NEXT: blendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] +; SSE42-NEXT: movss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 4(%rdi), %xmm5 ## xmm5 = 
mem[0],zero,zero,zero +; SSE42-NEXT: movss 8(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 12(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 16(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 20(%rdi), %xmm8 ## xmm8 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 24(%rdi), %xmm9 ## xmm9 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 28(%rdi), %xmm10 ## xmm10 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 32(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 36(%rdi), %xmm11 ## xmm11 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 40(%rdi), %xmm12 ## xmm12 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 44(%rdi), %xmm13 ## xmm13 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 48(%rdi), %xmm14 ## xmm14 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 52(%rdi), %xmm15 ## xmm15 = mem[0],zero,zero,zero +; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0],xmm0[3] +; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[0] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0],xmm1[3] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm10[0] +; SSE42-NEXT: insertps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm13 = xmm13[0,1],xmm15[0],xmm13[3] +; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0],xmm4[3] +; SSE42-NEXT: blendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] +; SSE42-NEXT: blendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: expandload_v16f32_const: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovsd 44(%rdi), %xmm0 ## xmm0 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 52(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1OR2-NEXT: vmovsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 40(%rdi), %xmm2, %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 +; AVX1OR2-NEXT: vmovss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 4(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 8(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 12(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 16(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 20(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 24(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 28(%rdi), %xmm8 ## xmm8 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 32(%rdi), %xmm9 ## xmm9 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 36(%rdi), %xmm10 ## xmm10 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 40(%rdi), %xmm11 ## xmm11 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 44(%rdi), %xmm12 ## xmm12 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 48(%rdi), %xmm13 ## xmm13 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 52(%rdi), %xmm14 ## xmm14 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[0] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; AVX1OR2-NEXT: 
vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0],xmm2[3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm11[0],xmm3[3] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX1OR2-NEXT: retq ; @@ -1278,29 +1340,78 @@ define <16 x float> @expandload_v16f32_const(ptr %base, <16 x float> %src0) { define <16 x float> @expandload_v16f32_const_undef(ptr %base) { ; SSE2-LABEL: expandload_v16f32_const_undef: ; SSE2: ## %bb.0: -; SSE2-NEXT: movss 40(%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movups (%rdi), %xmm0 -; SSE2-NEXT: movups 16(%rdi), %xmm1 +; SSE2-NEXT: movss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 4(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 8(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 12(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 16(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 20(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 24(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 28(%rdi), %xmm8 ## xmm8 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 32(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 36(%rdi), %xmm9 ## xmm9 = mem[0],zero,zero,zero +; SSE2-NEXT: movss 40(%rdi), %xmm10 ## xmm10 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0] ; SSE2-NEXT: movups 44(%rdi), %xmm3 ; SSE2-NEXT: retq ; ; SSE42-LABEL: expandload_v16f32_const_undef: ; SSE42: ## %bb.0: -; SSE42-NEXT: movsd 32(%rdi), %xmm2 ## xmm2 = mem[0],zero -; SSE42-NEXT: insertps $32, 40(%rdi), %xmm2 ## xmm2 = xmm2[0,1],mem[0],xmm2[3] -; SSE42-NEXT: movups (%rdi), %xmm0 -; SSE42-NEXT: movups 16(%rdi), %xmm1 +; SSE42-NEXT: movss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 4(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 8(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 12(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 16(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 20(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 24(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 28(%rdi), %xmm8 ## xmm8 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 32(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 36(%rdi), %xmm9 ## xmm9 = mem[0],zero,zero,zero +; SSE42-NEXT: movss 40(%rdi), %xmm10 ## xmm10 = mem[0],zero,zero,zero +; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] +; SSE42-NEXT: insertps {{.*#+}} xmm0 
= xmm0[0,1,2],xmm5[0] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] +; SSE42-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0] +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[2,3] +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm10[0],xmm2[3] +; SSE42-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[0] ; SSE42-NEXT: movups 44(%rdi), %xmm3 ; SSE42-NEXT: retq ; ; AVX1OR2-LABEL: expandload_v16f32_const_undef: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vmovsd 32(%rdi), %xmm0 ## xmm0 = mem[0],zero -; AVX1OR2-NEXT: vinsertps $32, 40(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3] -; AVX1OR2-NEXT: vinsertf128 $1, 44(%rdi), %ymm0, %ymm1 -; AVX1OR2-NEXT: vmovups (%rdi), %ymm0 +; AVX1OR2-NEXT: vmovss (%rdi), %xmm0 ## xmm0 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 4(%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 8(%rdi), %xmm2 ## xmm2 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 12(%rdi), %xmm3 ## xmm3 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 16(%rdi), %xmm4 ## xmm4 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 20(%rdi), %xmm5 ## xmm5 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 24(%rdi), %xmm6 ## xmm6 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 28(%rdi), %xmm7 ## xmm7 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 32(%rdi), %xmm8 ## xmm8 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 36(%rdi), %xmm9 ## xmm9 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vmovss 40(%rdi), %xmm10 ## xmm10 = mem[0],zero,zero,zero +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX1OR2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[2,3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm10[0],xmm1[3] +; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0] +; AVX1OR2-NEXT: vinsertf128 $1, 44(%rdi), %ymm1, %ymm1 ; AVX1OR2-NEXT: retq ; ; AVX512F-LABEL: expandload_v16f32_const_undef: diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 4e6f666fa05de..cbee1f4ae2896 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -2227,8 +2227,16 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i ; X64-KNL-LABEL: test30: ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; X64-KNL-NEXT: movw $-3, %ax +; X64-KNL-NEXT: kmovw %eax, %k0 ; X64-KNL-NEXT: andl $1, %edi -; X64-KNL-NEXT: kmovw %edi, %k0 +; X64-KNL-NEXT: kmovw %edi, %k1 +; X64-KNL-NEXT: kshiftrw $1, %k0, %k2 +; X64-KNL-NEXT: kshiftlw $1, %k2, %k2 +; X64-KNL-NEXT: korw %k1, %k2, %k1 +; X64-KNL-NEXT: kshiftlw $4, %k0, %k2 +; X64-KNL-NEXT: korw %k1, %k2, %k1 +; X64-KNL-NEXT: kandw %k0, %k1, %k0 ; X64-KNL-NEXT: kmovw %esi, %k1 ; X64-KNL-NEXT: kshiftlw $15, %k1, %k1 ; X64-KNL-NEXT: kshiftrw $14, %k1, %k1 @@ -2256,9 +2264,17 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i ; X86-KNL-LABEL: test30: ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; X86-KNL-NEXT: movw $-3, %ax +; 
X86-KNL-NEXT: kmovw %eax, %k0 ; X86-KNL-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-KNL-NEXT: andl $1, %eax -; X86-KNL-NEXT: kmovw %eax, %k0 +; X86-KNL-NEXT: kmovw %eax, %k1 +; X86-KNL-NEXT: kshiftrw $1, %k0, %k2 +; X86-KNL-NEXT: kshiftlw $1, %k2, %k2 +; X86-KNL-NEXT: korw %k1, %k2, %k1 +; X86-KNL-NEXT: kshiftlw $4, %k0, %k2 +; X86-KNL-NEXT: korw %k1, %k2, %k1 +; X86-KNL-NEXT: kandw %k0, %k1, %k0 ; X86-KNL-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-KNL-NEXT: kmovw %eax, %k1 ; X86-KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -2286,13 +2302,21 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i ; ; X64-SKX-LABEL: test30: ; X64-SKX: # %bb.0: -; X64-SKX-NEXT: kmovw %esi, %k0 -; X64-SKX-NEXT: kshiftlb $7, %k0, %k0 -; X64-SKX-NEXT: kshiftrb $6, %k0, %k0 +; X64-SKX-NEXT: movb $-3, %al +; X64-SKX-NEXT: kmovw %eax, %k0 ; X64-SKX-NEXT: kmovw %edi, %k1 ; X64-SKX-NEXT: kshiftlb $7, %k1, %k1 ; X64-SKX-NEXT: kshiftrb $7, %k1, %k1 -; X64-SKX-NEXT: korw %k0, %k1, %k0 +; X64-SKX-NEXT: kshiftrb $1, %k0, %k2 +; X64-SKX-NEXT: kshiftlb $1, %k2, %k2 +; X64-SKX-NEXT: korw %k1, %k2, %k1 +; X64-SKX-NEXT: kshiftlb $4, %k0, %k2 +; X64-SKX-NEXT: korw %k1, %k2, %k1 +; X64-SKX-NEXT: kandw %k0, %k1, %k0 +; X64-SKX-NEXT: kmovw %esi, %k1 +; X64-SKX-NEXT: kshiftlb $7, %k1, %k1 +; X64-SKX-NEXT: kshiftrb $6, %k1, %k1 +; X64-SKX-NEXT: korw %k1, %k0, %k0 ; X64-SKX-NEXT: movb $-5, %al ; X64-SKX-NEXT: kmovw %eax, %k1 ; X64-SKX-NEXT: kandw %k1, %k0, %k0 @@ -2313,15 +2337,23 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i ; ; X86-SKX-LABEL: test30: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SKX-NEXT: movb $-3, %al ; X86-SKX-NEXT: kmovw %eax, %k0 -; X86-SKX-NEXT: kshiftlb $7, %k0, %k0 -; X86-SKX-NEXT: kshiftrb $6, %k0, %k0 ; X86-SKX-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: kmovw %eax, %k1 ; X86-SKX-NEXT: kshiftlb $7, %k1, %k1 ; X86-SKX-NEXT: kshiftrb $7, %k1, %k1 -; X86-SKX-NEXT: korw %k0, %k1, %k0 +; X86-SKX-NEXT: kshiftrb $1, %k0, %k2 +; X86-SKX-NEXT: kshiftlb $1, %k2, %k2 +; X86-SKX-NEXT: korw %k1, %k2, %k1 +; X86-SKX-NEXT: kshiftlb $4, %k0, %k2 +; X86-SKX-NEXT: korw %k1, %k2, %k1 +; X86-SKX-NEXT: kandw %k0, %k1, %k0 +; X86-SKX-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SKX-NEXT: kmovw %eax, %k1 +; X86-SKX-NEXT: kshiftlb $7, %k1, %k1 +; X86-SKX-NEXT: kshiftrb $6, %k1, %k1 +; X86-SKX-NEXT: korw %k1, %k0, %k0 ; X86-SKX-NEXT: movb $-5, %al ; X86-SKX-NEXT: kmovw %eax, %k1 ; X86-SKX-NEXT: kandw %k1, %k0, %k0 @@ -2350,8 +2382,16 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> ; X64-KNL-LABEL: test30b: ; X64-KNL: # %bb.0: ; X64-KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 +; X64-KNL-NEXT: movw $-3, %ax +; X64-KNL-NEXT: kmovw %eax, %k0 ; X64-KNL-NEXT: andl $1, %edi -; X64-KNL-NEXT: kmovw %edi, %k0 +; X64-KNL-NEXT: kmovw %edi, %k1 +; X64-KNL-NEXT: kshiftrw $1, %k0, %k2 +; X64-KNL-NEXT: kshiftlw $1, %k2, %k2 +; X64-KNL-NEXT: korw %k1, %k2, %k1 +; X64-KNL-NEXT: kshiftlw $4, %k0, %k2 +; X64-KNL-NEXT: korw %k1, %k2, %k1 +; X64-KNL-NEXT: kandw %k0, %k1, %k0 ; X64-KNL-NEXT: kmovw %esi, %k1 ; X64-KNL-NEXT: kshiftlw $15, %k1, %k1 ; X64-KNL-NEXT: kshiftrw $14, %k1, %k1 @@ -2378,9 +2418,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> ; X86-KNL-LABEL: test30b: ; X86-KNL: # %bb.0: ; X86-KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; X86-KNL-NEXT: movw $-3, %ax +; X86-KNL-NEXT: kmovw %eax, %k0 ; X86-KNL-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-KNL-NEXT: andl $1, %eax 
-; X86-KNL-NEXT: kmovw %eax, %k0 +; X86-KNL-NEXT: kmovw %eax, %k1 +; X86-KNL-NEXT: kshiftrw $1, %k0, %k2 +; X86-KNL-NEXT: kshiftlw $1, %k2, %k2 +; X86-KNL-NEXT: korw %k1, %k2, %k1 +; X86-KNL-NEXT: kshiftlw $4, %k0, %k2 +; X86-KNL-NEXT: korw %k1, %k2, %k1 +; X86-KNL-NEXT: kandw %k0, %k1, %k0 ; X86-KNL-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-KNL-NEXT: kmovw %eax, %k1 ; X86-KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -2407,13 +2455,21 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> ; ; X64-SKX-LABEL: test30b: ; X64-SKX: # %bb.0: -; X64-SKX-NEXT: kmovw %esi, %k0 -; X64-SKX-NEXT: kshiftlb $7, %k0, %k0 -; X64-SKX-NEXT: kshiftrb $6, %k0, %k0 +; X64-SKX-NEXT: movb $-3, %al +; X64-SKX-NEXT: kmovw %eax, %k0 ; X64-SKX-NEXT: kmovw %edi, %k1 ; X64-SKX-NEXT: kshiftlb $7, %k1, %k1 ; X64-SKX-NEXT: kshiftrb $7, %k1, %k1 -; X64-SKX-NEXT: korw %k0, %k1, %k0 +; X64-SKX-NEXT: kshiftrb $1, %k0, %k2 +; X64-SKX-NEXT: kshiftlb $1, %k2, %k2 +; X64-SKX-NEXT: korw %k1, %k2, %k1 +; X64-SKX-NEXT: kshiftlb $4, %k0, %k2 +; X64-SKX-NEXT: korw %k1, %k2, %k1 +; X64-SKX-NEXT: kandw %k0, %k1, %k0 +; X64-SKX-NEXT: kmovw %esi, %k1 +; X64-SKX-NEXT: kshiftlb $7, %k1, %k1 +; X64-SKX-NEXT: kshiftrb $6, %k1, %k1 +; X64-SKX-NEXT: korw %k1, %k0, %k0 ; X64-SKX-NEXT: movb $-5, %al ; X64-SKX-NEXT: kmovw %eax, %k1 ; X64-SKX-NEXT: kandw %k1, %k0, %k0 @@ -2433,15 +2489,23 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> ; ; X86-SKX-LABEL: test30b: ; X86-SKX: # %bb.0: -; X86-SKX-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SKX-NEXT: movb $-3, %al ; X86-SKX-NEXT: kmovw %eax, %k0 -; X86-SKX-NEXT: kshiftlb $7, %k0, %k0 -; X86-SKX-NEXT: kshiftrb $6, %k0, %k0 ; X86-SKX-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-SKX-NEXT: kmovw %eax, %k1 ; X86-SKX-NEXT: kshiftlb $7, %k1, %k1 ; X86-SKX-NEXT: kshiftrb $7, %k1, %k1 -; X86-SKX-NEXT: korw %k0, %k1, %k0 +; X86-SKX-NEXT: kshiftrb $1, %k0, %k2 +; X86-SKX-NEXT: kshiftlb $1, %k2, %k2 +; X86-SKX-NEXT: korw %k1, %k2, %k1 +; X86-SKX-NEXT: kshiftlb $4, %k0, %k2 +; X86-SKX-NEXT: korw %k1, %k2, %k1 +; X86-SKX-NEXT: kandw %k0, %k1, %k0 +; X86-SKX-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-SKX-NEXT: kmovw %eax, %k1 +; X86-SKX-NEXT: kshiftlb $7, %k1, %k1 +; X86-SKX-NEXT: kshiftrb $6, %k1, %k1 +; X86-SKX-NEXT: korw %k1, %k0, %k0 ; X86-SKX-NEXT: movb $-5, %al ; X86-SKX-NEXT: kmovw %eax, %k1 ; X86-SKX-NEXT: kandw %k1, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll index c7320275091c6..8c5cf4af7b790 100644 --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -5356,8 +5356,16 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind { ; AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: movw $-3, %ax +; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k0 +; AVX512F-NEXT: kmovw %esi, %k1 +; AVX512F-NEXT: kshiftrw $1, %k0, %k2 +; AVX512F-NEXT: kshiftlw $1, %k2, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k0 ; AVX512F-NEXT: kmovw %edx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 @@ -5380,13 +5388,21 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind { ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: kmovw %edx, %k0 -; 
AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 +; AVX512VLDQ-NEXT: movb $-3, %al +; AVX512VLDQ-NEXT: kmovw %eax, %k0 ; AVX512VLDQ-NEXT: kmovw %esi, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 -; AVX512VLDQ-NEXT: korw %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k2 +; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2 +; AVX512VLDQ-NEXT: korw %k1, %k2, %k1 +; AVX512VLDQ-NEXT: kshiftlb $4, %k0, %k2 +; AVX512VLDQ-NEXT: korw %k1, %k2, %k1 +; AVX512VLDQ-NEXT: kandw %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kmovw %edx, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftrb $6, %k1, %k1 +; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 ; AVX512VLDQ-NEXT: movb $-5, %al ; AVX512VLDQ-NEXT: kmovw %eax, %k1 ; AVX512VLDQ-NEXT: kandw %k1, %k0, %k0 @@ -5402,8 +5418,16 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind { ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: +; AVX512VLBW-NEXT: movw $-3, %ax +; AVX512VLBW-NEXT: kmovd %eax, %k0 ; AVX512VLBW-NEXT: andl $1, %esi -; AVX512VLBW-NEXT: kmovw %esi, %k0 +; AVX512VLBW-NEXT: kmovw %esi, %k1 +; AVX512VLBW-NEXT: kshiftrw $1, %k0, %k2 +; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512VLBW-NEXT: korw %k1, %k2, %k1 +; AVX512VLBW-NEXT: kshiftlw $4, %k0, %k2 +; AVX512VLBW-NEXT: korw %k1, %k2, %k1 +; AVX512VLBW-NEXT: kandw %k0, %k1, %k0 ; AVX512VLBW-NEXT: kmovd %edx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $14, %k1, %k1 @@ -5423,13 +5447,21 @@ define void @widen_masked_store(<3 x i32> %v, ptr %p, <3 x i1> %mask) nounwind { ; ; X86-AVX512-LABEL: widen_masked_store: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k0 -; X86-AVX512-NEXT: kshiftlb $7, %k0, %k0 -; X86-AVX512-NEXT: kshiftrb $6, %k0, %k0 +; X86-AVX512-NEXT: movb $-3, %al +; X86-AVX512-NEXT: kmovd %eax, %k0 ; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 ; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 ; X86-AVX512-NEXT: kshiftrb $7, %k1, %k1 -; X86-AVX512-NEXT: korw %k0, %k1, %k0 +; X86-AVX512-NEXT: kshiftrb $1, %k0, %k2 +; X86-AVX512-NEXT: kshiftlb $1, %k2, %k2 +; X86-AVX512-NEXT: korw %k1, %k2, %k1 +; X86-AVX512-NEXT: kshiftlb $4, %k0, %k2 +; X86-AVX512-NEXT: korw %k1, %k2, %k1 +; X86-AVX512-NEXT: kandw %k0, %k1, %k0 +; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1 +; X86-AVX512-NEXT: kshiftrb $6, %k1, %k1 +; X86-AVX512-NEXT: korw %k1, %k0, %k0 ; X86-AVX512-NEXT: movb $-5, %al ; X86-AVX512-NEXT: kmovd %eax, %k1 ; X86-AVX512-NEXT: kandw %k1, %k0, %k0 @@ -5933,19 +5965,19 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] ; AVX2-NEXT: vpcmpgtd %ymm5, %ymm3, %ymm3 ; AVX2-NEXT: vpacksswb %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm3, %ymm3 -; AVX2-NEXT: vpmaskmovd %ymm0, %ymm3, (%rdx) -; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx) -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 
= xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx) +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpslld $31, %ymm4, %ymm4 +; AVX2-NEXT: vpmaskmovd %ymm2, %ymm4, 64(%rdx) +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX2-NEXT: vpslld $31, %ymm2, %ymm2 +; AVX2-NEXT: vpmaskmovd %ymm1, %ymm2, 32(%rdx) +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpslld $31, %ymm1, %ymm1 +; AVX2-NEXT: vpmaskmovd %ymm0, %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -6028,12 +6060,13 @@ define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind { ; SSE2-LABEL: undefshuffle: ; SSE2: ## %bb.0: ## %else ; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 -; SSE2-NEXT: pinsrw $2, -{{[0-9]+}}(%rsp), %xmm0 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; SSE2-NEXT: movzwl -{{[0-9]+}}(%rsp), %edx +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE2-NEXT: pinsrw $2, %eax, %xmm0 +; SSE2-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: pinsrw $3, %eax, %xmm0 ; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 @@ -6097,10 +6130,17 @@ define void @undefshuffle(<8 x i1> %i0, ptr %src, ptr %dst) nounwind { ; ; SSE4-LABEL: undefshuffle: ; SSE4: ## %bb.0: ## %else -; SSE4-NEXT: psllw $15, %xmm0 -; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: packsswb %xmm0, %xmm0 -; SSE4-NEXT: pmovmskb %xmm0, %eax +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE4-NEXT: pextrb $2, %xmm0, %eax +; SSE4-NEXT: pinsrw $1, %eax, %xmm1 +; SSE4-NEXT: pextrb $4, %xmm0, %eax +; SSE4-NEXT: pinsrw $2, %eax, %xmm1 +; SSE4-NEXT: pextrb $6, %xmm0, %eax +; SSE4-NEXT: pinsrw $3, %eax, %xmm1 +; SSE4-NEXT: psllw $15, %xmm1 +; SSE4-NEXT: packsswb %xmm1, %xmm1 +; SSE4-NEXT: pmovmskb %xmm1, %eax ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne LBB32_1 ; SSE4-NEXT: ## %bb.2: ## %else23 diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 388d8528a2b80..8e308abe4a000 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -658,12 +658,12 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX1-NEXT: vaddsd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm4[0],ymm0[1],ymm4[2],ymm0[2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1 -; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1 +; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[1],ymm1[0],ymm3[2],ymm1[3] ; AVX1-NEXT: vmovsd %xmm2, 64(%rdi) -; AVX1-NEXT: vmovapd %ymm1, 32(%rdi) ; AVX1-NEXT: vmovapd %ymm0, (%rdi) +; AVX1-NEXT: vmovapd %ymm1, 32(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -716,12 +716,12 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin ; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vbroadcastsd %xmm7, %ymm2 ; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3] -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm3 -; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3] +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm3 +; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[2],ymm0[3] ; AVX2-NEXT: vmovsd %xmm1, 64(%rdi) -; AVX2-NEXT: vmovapd %ymm0, 32(%rdi) ; AVX2-NEXT: vmovapd %ymm2, (%rdi) +; AVX2-NEXT: vmovapd %ymm0, 32(%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 595f8491b405c..1df4e9f47f21b 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -262,37 +262,54 @@ define <4 x float> @merge_4f32_f32_45zz(ptr %ptr) nounwind uwtable noinline ssp define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_4f32_f32_012u: ; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: ; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_4f32_f32_012u: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_012u: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: 
xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X86-SSE1-NEXT: retl ; ; X86-SSE41-LABEL: merge_4f32_f32_012u: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; X86-SSE41-NEXT: retl %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 2 @@ -309,37 +326,54 @@ define <4 x float> @merge_4f32_f32_012u(ptr %ptr) nounwind uwtable noinline ssp define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_4f32_f32_019u: ; SSE2: # %bb.0: +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: ; SSE41: # %bb.0: -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE41-NEXT: retq ; ; AVX-LABEL: merge_4f32_f32_019u: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX-NEXT: retq ; ; X86-SSE1-LABEL: merge_4f32_f32_019u: ; X86-SSE1: # %bb.0: ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE1-NEXT: xorps %xmm0, %xmm0 -; X86-SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] +; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-SSE1-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; X86-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X86-SSE1-NEXT: retl ; ; X86-SSE41-LABEL: merge_4f32_f32_019u: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] +; X86-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; X86-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; X86-SSE41-NEXT: retl %ptr1 = getelementptr inbounds float, ptr %ptr, i64 1 %ptr2 = getelementptr inbounds float, ptr %ptr, i64 9 diff --git a/llvm/test/CodeGen/X86/mmx-build-vector.ll b/llvm/test/CodeGen/X86/mmx-build-vector.ll index d8a010bacc683..e3b2dcc8728f8 100644 --- a/llvm/test/CodeGen/X86/mmx-build-vector.ll +++ b/llvm/test/CodeGen/X86/mmx-build-vector.ll @@ -65,30 +65,23 @@ define void @build_v2i32_0z(ptr%p0, i32 %a0, i32 %a1) nounwind { } define void @build_v2i32_u1(ptr%p0, i32 %a0, i32 %a1) nounwind { -; X86-MMX-LABEL: build_v2i32_u1: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0] -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: retl -; -; X86-SSE-LABEL: build_v2i32_u1: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: retl +; X86-LABEL: build_v2i32_u1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 +; X86-NEXT: punpckldq %mm1, %mm0 # mm0 = mm0[0],mm1[0] +; X86-NEXT: paddd %mm0, %mm0 +; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: retl ; ; X64-LABEL: build_v2i32_u1: ; X64: # %bb.0: ; X64-NEXT: movd %edx, %mm0 -; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) +; X64-NEXT: movd %eax, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: paddd %mm1, %mm1 +; X64-NEXT: movq %mm1, (%rdi) ; X64-NEXT: retq %1 = insertelement <2 x i32> undef, i32 undef, i32 0 %2 = insertelement <2 x i32> %1, i32 %a1, i32 1 @@ -238,17 +231,28 @@ define void @build_v4i16_01zz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { ; X86-LABEL: build_v4i16_0uuz: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: movd %eax, %mm1 +; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X86-NEXT: pxor %mm2, %mm2 +; X86-NEXT: punpcklwd %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1] +; X86-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-NEXT: paddd %mm1, %mm1 +; X86-NEXT: movq %mm1, (%ecx) ; X86-NEXT: retl 
; ; X64-LABEL: build_v4i16_0uuz: ; X64: # %bb.0: -; X64-NEXT: movd %esi, %mm0 -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) +; X64-NEXT: movd %eax, %mm0 +; X64-NEXT: pxor %mm1, %mm1 +; X64-NEXT: movd %esi, %mm2 +; X64-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] +; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X64-NEXT: paddd %mm2, %mm2 +; X64-NEXT: movq %mm2, (%rdi) ; X64-NEXT: retq %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 undef, i32 1 @@ -261,22 +265,33 @@ define void @build_v4i16_0uuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi } define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { -; X86-LABEL: build_v4i16_0zuz: -; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd %eax, %mm0 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: retl +; X86-MMX-LABEL: build_v4i16_0zuz: +; X86-MMX: # %bb.0: +; X86-MMX-NEXT: pushl %ebp +; X86-MMX-NEXT: movl %esp, %ebp +; X86-MMX-NEXT: andl $-8, %esp +; X86-MMX-NEXT: subl $8, %esp +; X86-MMX-NEXT: movl 8(%ebp), %eax +; X86-MMX-NEXT: movzwl 12(%ebp), %ecx +; X86-MMX-NEXT: movzwl %ax, %edx +; X86-MMX-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-MMX-NEXT: movl %ecx, (%esp) +; X86-MMX-NEXT: movq (%esp), %mm0 +; X86-MMX-NEXT: paddd %mm0, %mm0 +; X86-MMX-NEXT: movq %mm0, (%eax) +; X86-MMX-NEXT: movl %ebp, %esp +; X86-MMX-NEXT: popl %ebp +; X86-MMX-NEXT: retl ; -; X64-LABEL: build_v4i16_0zuz: -; X64: # %bb.0: -; X64-NEXT: movzwl %si, %eax -; X64-NEXT: movd %eax, %mm0 -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: build_v4i16_0zuz: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 0, i32 1 %3 = insertelement <4 x i16> %2, i16 undef, i32 2 @@ -290,27 +305,34 @@ define void @build_v4i16_0zuz(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { ; X86-LABEL: build_v4i16_012u: ; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 -; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] +; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: movd %esi, %mm1 +; X86-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X86-NEXT: movd %edx, %mm0 +; X86-NEXT: movd %ecx, %mm2 +; X86-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X86-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0] ; X86-NEXT: paddd %mm2, %mm2 ; X86-NEXT: movq %mm2, (%eax) +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: build_v4i16_012u: ; X64: # %bb.0: -; X64-NEXT: movd %ecx, %mm0 -; X64-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] -; X64-NEXT: movd %edx, %mm1 -; X64-NEXT: 
movd %esi, %mm2 -; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X64-NEXT: paddd %mm2, %mm2 -; X64-NEXT: movq %mm2, (%rdi) +; X64-NEXT: movd %edx, %mm0 +; X64-NEXT: movd %esi, %mm1 +; X64-NEXT: punpcklwd %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1] +; X64-NEXT: movd %eax, %mm0 +; X64-NEXT: movd %ecx, %mm2 +; X64-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X64-NEXT: punpckldq %mm2, %mm1 # mm1 = mm1[0],mm2[0] +; X64-NEXT: paddd %mm1, %mm1 +; X64-NEXT: movq %mm1, (%rdi) ; X64-NEXT: retq %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 %a1, i32 1 @@ -323,31 +345,30 @@ define void @build_v4i16_012u(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwi } define void @build_v4i16_0u00(ptr%p0, i16 %a0, i16 %a1, i16 %a2, i16 %a3) nounwind { -; X86-MMX-LABEL: build_v4i16_0u00: -; X86-MMX: # %bb.0: -; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-MMX-NEXT: punpcklwd %mm0, %mm0 # mm0 = mm0[0,0,1,1] -; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0] -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) -; X86-MMX-NEXT: retl -; -; X86-SSE-LABEL: build_v4i16_0u00: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-SSE-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) -; X86-SSE-NEXT: retl +; X86-LABEL: build_v4i16_0u00: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movd %eax, %mm0 +; X86-NEXT: movd %eax, %mm1 +; X86-NEXT: movq %mm1, %mm2 +; X86-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X86-NEXT: punpcklwd %mm1, %mm1 # mm1 = mm1[0,0,1,1] +; X86-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0] +; X86-NEXT: paddd %mm2, %mm2 +; X86-NEXT: movq %mm2, (%ecx) +; X86-NEXT: retl ; ; X64-LABEL: build_v4i16_0u00: ; X64: # %bb.0: -; X64-NEXT: movd %esi, %mm0 -; X64-NEXT: pshufw $0, %mm0, %mm0 # mm0 = mm0[0,0,0,0] -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) +; X64-NEXT: movd %eax, %mm0 +; X64-NEXT: movd %esi, %mm1 +; X64-NEXT: movq %mm1, %mm2 +; X64-NEXT: punpcklwd %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1] +; X64-NEXT: punpcklwd %mm1, %mm1 # mm1 = mm1[0,0,1,1] +; X64-NEXT: punpckldq %mm1, %mm2 # mm2 = mm2[0],mm1[0] +; X64-NEXT: paddd %mm2, %mm2 +; X64-NEXT: movq %mm2, (%rdi) ; X64-NEXT: retq %1 = insertelement <4 x i16> undef, i16 %a0, i32 0 %2 = insertelement <4 x i16> %1, i16 undef, i32 1 @@ -427,19 +448,22 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: pxor %mm1, %mm1 ; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 -; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3] -; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] ; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3] +; X86-NEXT: punpcklbw %mm2, %mm0 # mm0 = mm0[0],mm2[0],mm0[1],mm2[1],mm0[2],mm2[2],mm0[3],mm2[3] 
; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X86-NEXT: punpckldq %mm2, %mm0 # mm0 = mm0[0],mm2[0] -; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%eax) +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 +; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 +; X86-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3] +; X86-NEXT: movzbl %al, %ecx +; X86-NEXT: movd %ecx, %mm1 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movd %ecx, %mm3 +; X86-NEXT: punpcklbw %mm1, %mm3 # mm3 = mm3[0],mm1[0],mm3[1],mm1[1],mm3[2],mm1[2],mm3[3],mm1[3] +; X86-NEXT: punpcklwd %mm2, %mm3 # mm3 = mm3[0],mm2[0],mm3[1],mm2[1] +; X86-NEXT: punpckldq %mm0, %mm3 # mm3 = mm3[0],mm0[0] +; X86-NEXT: paddd %mm3, %mm3 +; X86-NEXT: movq %mm3, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: build_v8i8_0u2345z7: @@ -455,7 +479,9 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; X64-NEXT: movd %ecx, %mm2 ; X64-NEXT: punpcklbw %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1],mm2[2],mm1[2],mm2[3],mm1[3] ; X64-NEXT: movd %esi, %mm1 -; X64-NEXT: punpcklbw %mm1, %mm1 # mm1 = mm1[0,0,1,1,2,2,3,3] +; X64-NEXT: movzbl %al, %eax +; X64-NEXT: movd %eax, %mm3 +; X64-NEXT: punpcklbw %mm3, %mm1 # mm1 = mm1[0],mm3[0],mm1[1],mm3[1],mm1[2],mm3[2],mm1[3],mm3[3] ; X64-NEXT: punpcklwd %mm2, %mm1 # mm1 = mm1[0],mm2[0],mm1[1],mm2[1] ; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] ; X64-NEXT: paddd %mm1, %mm1 @@ -476,44 +502,55 @@ define void @build_v8i8_0u2345z7(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, } define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind { -; X86-LABEL: build_v8i8_0123zzzu: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm1 -; X86-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm2 -; X86-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3] -; X86-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X86-NEXT: pxor %mm0, %mm0 -; X86-NEXT: pxor %mm1, %mm1 -; X86-NEXT: punpcklbw %mm1, %mm1 # mm1 = mm1[0,0,1,1,2,2,3,3] -; X86-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3] -; X86-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X86-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X86-NEXT: paddd %mm2, %mm2 -; X86-NEXT: movq %mm2, (%eax) -; X86-NEXT: retl +; X86-MMX-LABEL: build_v8i8_0123zzzu: +; X86-MMX: # %bb.0: +; X86-MMX-NEXT: pushl %ebp +; X86-MMX-NEXT: movl %esp, %ebp +; X86-MMX-NEXT: pushl %esi +; X86-MMX-NEXT: andl $-8, %esp +; X86-MMX-NEXT: subl $16, %esp +; X86-MMX-NEXT: movl 8(%ebp), %eax +; X86-MMX-NEXT: movzbl 20(%ebp), %edx +; X86-MMX-NEXT: movzbl 24(%ebp), %ecx +; X86-MMX-NEXT: shll $8, %ecx +; X86-MMX-NEXT: orl %edx, %ecx +; X86-MMX-NEXT: shll $16, %ecx +; X86-MMX-NEXT: movzbl 12(%ebp), %edx +; X86-MMX-NEXT: movzbl 16(%ebp), %esi +; X86-MMX-NEXT: shll $8, %esi +; X86-MMX-NEXT: orl %edx, %esi +; X86-MMX-NEXT: movzwl %si, %edx +; X86-MMX-NEXT: orl %ecx, %edx +; X86-MMX-NEXT: movzbl %al, %ecx +; X86-MMX-NEXT: shll $24, %ecx +; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-MMX-NEXT: movl %edx, (%esp) +; X86-MMX-NEXT: movq (%esp), %mm0 +; X86-MMX-NEXT: paddd %mm0, %mm0 +; X86-MMX-NEXT: movq %mm0, (%eax) +; X86-MMX-NEXT: leal -4(%ebp), %esp +; X86-MMX-NEXT: popl %esi +; X86-MMX-NEXT: 
popl %ebp +; X86-MMX-NEXT: retl ; -; X64-LABEL: build_v8i8_0123zzzu: -; X64: # %bb.0: -; X64-NEXT: movd %r8d, %mm0 -; X64-NEXT: movd %ecx, %mm1 -; X64-NEXT: punpcklbw %mm0, %mm1 # mm1 = mm1[0],mm0[0],mm1[1],mm0[1],mm1[2],mm0[2],mm1[3],mm0[3] -; X64-NEXT: movd %edx, %mm0 -; X64-NEXT: movd %esi, %mm2 -; X64-NEXT: punpcklbw %mm0, %mm2 # mm2 = mm2[0],mm0[0],mm2[1],mm0[1],mm2[2],mm0[2],mm2[3],mm0[3] -; X64-NEXT: punpcklwd %mm1, %mm2 # mm2 = mm2[0],mm1[0],mm2[1],mm1[1] -; X64-NEXT: pxor %mm0, %mm0 -; X64-NEXT: pxor %mm1, %mm1 -; X64-NEXT: punpcklbw %mm1, %mm1 # mm1 = mm1[0,0,1,1,2,2,3,3] -; X64-NEXT: punpcklbw %mm0, %mm0 # mm0 = mm0[0,0,1,1,2,2,3,3] -; X64-NEXT: punpcklwd %mm1, %mm0 # mm0 = mm0[0],mm1[0],mm0[1],mm1[1] -; X64-NEXT: punpckldq %mm0, %mm2 # mm2 = mm2[0],mm0[0] -; X64-NEXT: paddd %mm2, %mm2 -; X64-NEXT: movq %mm2, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: build_v8i8_0123zzzu: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: shll $8, %edx +; X86-SSE-NEXT: orl %ecx, %edx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: shll $16, %ecx +; X86-SSE-NEXT: orl %edx, %ecx +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: shll $24, %edx +; X86-SSE-NEXT: orl %ecx, %edx +; X86-SSE-NEXT: movd %edx, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl %1 = insertelement <8 x i8> undef, i8 %a0, i32 0 %2 = insertelement <8 x i8> %1, i8 %a1, i32 1 %3 = insertelement <8 x i8> %2, i8 %a2, i32 2 @@ -529,20 +566,45 @@ define void @build_v8i8_0123zzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, } define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind { -; X86-LABEL: build_v8i8_0uuuuzzz: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: retl +; X86-MMX-LABEL: build_v8i8_0uuuuzzz: +; X86-MMX: # %bb.0: +; X86-MMX-NEXT: pushl %ebp +; X86-MMX-NEXT: movl %esp, %ebp +; X86-MMX-NEXT: pushl %edi +; X86-MMX-NEXT: pushl %esi +; X86-MMX-NEXT: andl $-8, %esp +; X86-MMX-NEXT: subl $8, %esp +; X86-MMX-NEXT: movl 8(%ebp), %eax +; X86-MMX-NEXT: movzbl %al, %ecx +; X86-MMX-NEXT: movl %ecx, %edx +; X86-MMX-NEXT: shll $8, %edx +; X86-MMX-NEXT: leal (%ecx,%edx), %esi +; X86-MMX-NEXT: shll $16, %esi +; X86-MMX-NEXT: movzbl 12(%ebp), %edi +; X86-MMX-NEXT: orl %edx, %edi +; X86-MMX-NEXT: movzwl %di, %edx +; X86-MMX-NEXT: orl %esi, %edx +; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-MMX-NEXT: movl %edx, (%esp) +; X86-MMX-NEXT: movq (%esp), %mm0 +; X86-MMX-NEXT: paddd %mm0, %mm0 +; X86-MMX-NEXT: movq %mm0, (%eax) +; X86-MMX-NEXT: leal -8(%ebp), %esp +; X86-MMX-NEXT: popl %esi +; X86-MMX-NEXT: popl %edi +; X86-MMX-NEXT: popl %ebp +; X86-MMX-NEXT: retl ; -; X64-LABEL: build_v8i8_0uuuuzzz: -; X64: # %bb.0: -; X64-NEXT: movd %esi, %mm0 -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: build_v8i8_0uuuuzzz: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl %1 = insertelement <8 x i8> undef, i8 
%a0, i32 0 %2 = insertelement <8 x i8> %1, i8 undef, i32 1 %3 = insertelement <8 x i8> %2, i8 undef, i32 2 @@ -558,22 +620,34 @@ define void @build_v8i8_0uuuuzzz(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, } define void @build_v8i8_0zzzzzzu(ptr%p0, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7) nounwind { -; X86-LABEL: build_v8i8_0zzzzzzu: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movd %eax, %mm0 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: paddd %mm0, %mm0 -; X86-NEXT: movq %mm0, (%eax) -; X86-NEXT: retl +; X86-MMX-LABEL: build_v8i8_0zzzzzzu: +; X86-MMX: # %bb.0: +; X86-MMX-NEXT: pushl %ebp +; X86-MMX-NEXT: movl %esp, %ebp +; X86-MMX-NEXT: andl $-8, %esp +; X86-MMX-NEXT: subl $8, %esp +; X86-MMX-NEXT: movl 8(%ebp), %eax +; X86-MMX-NEXT: movzbl 12(%ebp), %ecx +; X86-MMX-NEXT: movl %ecx, (%esp) +; X86-MMX-NEXT: movzbl %al, %ecx +; X86-MMX-NEXT: shll $24, %ecx +; X86-MMX-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-MMX-NEXT: movq (%esp), %mm0 +; X86-MMX-NEXT: paddd %mm0, %mm0 +; X86-MMX-NEXT: movq %mm0, (%eax) +; X86-MMX-NEXT: movl %ebp, %esp +; X86-MMX-NEXT: popl %ebp +; X86-MMX-NEXT: retl ; -; X64-LABEL: build_v8i8_0zzzzzzu: -; X64: # %bb.0: -; X64-NEXT: movzbl %sil, %eax -; X64-NEXT: movd %eax, %mm0 -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) -; X64-NEXT: retq +; X86-SSE-LABEL: build_v8i8_0zzzzzzu: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movd %ecx, %xmm0 +; X86-SSE-NEXT: movdq2q %xmm0, %mm0 +; X86-SSE-NEXT: paddd %mm0, %mm0 +; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: retl %1 = insertelement <8 x i8> undef, i8 %a0, i32 0 %2 = insertelement <8 x i8> %1, i8 0, i32 1 %3 = insertelement <8 x i8> %2, i8 0, i32 2 @@ -716,11 +790,14 @@ define void @build_v2f32_0z(ptr%p0, float %a0, float %a1) nounwind { define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind { ; X86-MMX-LABEL: build_v2f32_u1: ; X86-MMX: # %bb.0: +; X86-MMX-NEXT: pushl %eax ; X86-MMX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-MMX-NEXT: movd {{[0-9]+}}(%esp), %mm0 -; X86-MMX-NEXT: punpckldq %mm0, %mm0 # mm0 = mm0[0,0] -; X86-MMX-NEXT: paddd %mm0, %mm0 -; X86-MMX-NEXT: movq %mm0, (%eax) +; X86-MMX-NEXT: pxor %mm1, %mm1 +; X86-MMX-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-MMX-NEXT: paddd %mm1, %mm1 +; X86-MMX-NEXT: movq %mm1, (%eax) +; X86-MMX-NEXT: popl %eax ; X86-MMX-NEXT: retl ; ; X86-SSE-LABEL: build_v2f32_u1: @@ -728,17 +805,19 @@ define void @build_v2f32_u1(ptr%p0, float %a0, float %a1) nounwind { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE-NEXT: movdq2q %xmm0, %mm0 -; X86-SSE-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] -; X86-SSE-NEXT: paddd %mm0, %mm0 -; X86-SSE-NEXT: movq %mm0, (%eax) +; X86-SSE-NEXT: movdq2q %xmm0, %mm1 +; X86-SSE-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X86-SSE-NEXT: paddd %mm1, %mm1 +; X86-SSE-NEXT: movq %mm1, (%eax) ; X86-SSE-NEXT: retl ; ; X64-LABEL: build_v2f32_u1: ; X64: # %bb.0: ; X64-NEXT: movdq2q %xmm1, %mm0 -; X64-NEXT: pshufw $68, %mm0, %mm0 # mm0 = mm0[0,1,0,1] -; X64-NEXT: paddd %mm0, %mm0 -; X64-NEXT: movq %mm0, (%rdi) +; X64-NEXT: movdq2q %xmm0, %mm1 +; X64-NEXT: punpckldq %mm0, %mm1 # mm1 = mm1[0],mm0[0] +; X64-NEXT: paddd %mm1, %mm1 +; X64-NEXT: movq %mm1, (%rdi) ; X64-NEXT: retq %1 = insertelement <2 x float> undef, float undef, i32 0 %2 = insertelement <2 x float> %1, float %a1, i32 1 diff --git 
a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 4b0f75df83a76..4d8ddd1bb1132 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1920,35 +1920,35 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) { ; ; AVX1-LABEL: splat3_128: ; AVX1: # %bb.0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi) -; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 64(%rdi) ; AVX1-NEXT: vmovdqa %xmm1, 
48(%rdi) -; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm0, (%rdi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm5, (%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: splat3_128: @@ -1988,20 +1988,20 @@ define void @splat3_128(<16 x i8> %a0, <16 x i8> %a1, ptr%a2) { ; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] -; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm2, %xmm2 -; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0 -; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpperm %xmm8, %xmm1, %xmm6, %xmm1 -; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3 -; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5 -; XOP-NEXT: vmovdqa %xmm5, 80(%rdi) +; XOP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] +; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm7, %xmm8 +; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm2, %xmm2 +; XOP-NEXT: vpperm %xmm4, %xmm7, %xmm0, %xmm0 +; XOP-NEXT: vpperm %xmm4, %xmm1, %xmm6, %xmm1 +; XOP-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vpperm %xmm4, %xmm6, %xmm5, %xmm4 +; XOP-NEXT: vmovdqa %xmm4, 80(%rdi) ; XOP-NEXT: vmovdqa %xmm3, 64(%rdi) ; XOP-NEXT: vmovdqa %xmm1, 48(%rdi) -; XOP-NEXT: vmovdqa %xmm4, 32(%rdi) +; XOP-NEXT: vmovdqa %xmm0, 32(%rdi) ; XOP-NEXT: vmovdqa %xmm2, 16(%rdi) -; XOP-NEXT: vmovdqa %xmm0, (%rdi) +; XOP-NEXT: vmovdqa %xmm8, (%rdi) ; XOP-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <64 x i32> @@ -2092,36 +2092,36 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) { ; ; AVX1-LABEL: splat3_256: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = 
xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vmovdqa %xmm4, 80(%rdi) -; AVX1-NEXT: vmovdqa %xmm2, 64(%rdi) -; AVX1-NEXT: vmovdqa %xmm1, 48(%rdi) -; AVX1-NEXT: vmovdqa %xmm5, 32(%rdi) -; AVX1-NEXT: vmovdqa %xmm3, 16(%rdi) -; AVX1-NEXT: vmovdqa %xmm0, (%rdi) +; AVX1-NEXT: vmovdqa %xmm3, 64(%rdi) +; AVX1-NEXT: vmovdqa %xmm2, 48(%rdi) +; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) +; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) +; AVX1-NEXT: vmovdqa %xmm5, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -2161,20 +2161,20 @@ define void @splat3_256(<32 x i8> %a0, ptr%a1) { ; XOP-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] ; XOP-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; XOP-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; XOP-NEXT: vmovdqa {{.*#+}} xmm8 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] -; XOP-NEXT: vpperm %xmm8, %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpperm %xmm8, %xmm0, %xmm7, %xmm0 -; XOP-NEXT: vpperm %xmm8, %xmm7, %xmm4, %xmm4 -; XOP-NEXT: vpperm %xmm8, %xmm2, %xmm6, %xmm2 -; XOP-NEXT: vpperm %xmm8, %xmm5, %xmm3, %xmm3 -; XOP-NEXT: vpperm %xmm8, %xmm6, %xmm5, %xmm5 -; XOP-NEXT: vmovdqa %xmm5, 80(%rdi) +; XOP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,16,11,6,17,12,7,18,13,8,19,14,9,20,15,10] +; XOP-NEXT: vpperm %xmm4, %xmm1, %xmm7, %xmm8 +; XOP-NEXT: vpperm %xmm4, %xmm0, %xmm1, %xmm1 +; XOP-NEXT: vpperm %xmm4, %xmm7, %xmm0, %xmm0 +; XOP-NEXT: vpperm %xmm4, %xmm2, %xmm6, %xmm2 +; XOP-NEXT: vpperm %xmm4, %xmm5, %xmm3, %xmm3 +; XOP-NEXT: vpperm %xmm4, %xmm6, %xmm5, %xmm4 +; XOP-NEXT: vmovdqa %xmm4, 
80(%rdi) ; XOP-NEXT: vmovdqa %xmm3, 64(%rdi) ; XOP-NEXT: vmovdqa %xmm2, 48(%rdi) -; XOP-NEXT: vmovdqa %xmm4, 32(%rdi) +; XOP-NEXT: vmovdqa %xmm0, 32(%rdi) ; XOP-NEXT: vmovdqa %xmm1, 16(%rdi) -; XOP-NEXT: vmovdqa %xmm0, (%rdi) +; XOP-NEXT: vmovdqa %xmm8, (%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> @@ -2412,22 +2412,22 @@ define void @D107009(ptr %input, ptr %output) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3,3,3,7,7,7,7] -; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0,0,3,2] -; AVX1-NEXT: vmovshdup {{.*#+}} ymm5 = ymm0[1,1,3,3,5,5,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0,0,3,2] +; AVX1-NEXT: vmovshdup {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] ; AVX1-NEXT: vmovdqa %xmm1, 16(%rsi) ; AVX1-NEXT: vmovdqa %xmm7, 48(%rsi) ; AVX1-NEXT: vmovdqa %xmm6, 112(%rsi) +; AVX1-NEXT: vmovups %ymm5, 64(%rsi) ; AVX1-NEXT: vmovups %ymm0, 128(%rsi) -; AVX1-NEXT: vmovups %ymm5, 160(%rsi) -; AVX1-NEXT: vmovupd %ymm4, 192(%rsi) -; AVX1-NEXT: vmovupd %ymm3, 224(%rsi) -; AVX1-NEXT: vmovups %ymm2, 64(%rsi) +; AVX1-NEXT: vmovups %ymm4, 160(%rsi) +; AVX1-NEXT: vmovupd %ymm3, 192(%rsi) +; AVX1-NEXT: vmovupd %ymm2, 224(%rsi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -2480,22 +2480,22 @@ define void @D107009(ptr %input, ptr %output) { ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0 ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,1,3] -; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,3,3] -; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,3,3,3,7,7,7,7] -; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0,0,3,2] -; XOP-NEXT: vmovshdup {{.*#+}} ymm5 = ymm0[1,1,3,3,5,5,7,7] +; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,3,3,3,7,7,7,7] +; XOP-NEXT: vshufpd {{.*#+}} ymm3 = ymm0[0,0,3,2] +; XOP-NEXT: vmovshdup {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] +; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] +; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,3,3] +; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,3,3,3] ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] ; XOP-NEXT: vmovdqa %xmm1, 16(%rsi) ; XOP-NEXT: vmovdqa %xmm7, 48(%rsi) ; XOP-NEXT: vmovdqa %xmm6, 112(%rsi) +; XOP-NEXT: vmovups %ymm5, 64(%rsi) ; XOP-NEXT: vmovups %ymm0, 128(%rsi) -; XOP-NEXT: vmovups %ymm5, 160(%rsi) -; XOP-NEXT: vmovupd %ymm4, 192(%rsi) -; XOP-NEXT: vmovupd %ymm3, 224(%rsi) -; XOP-NEXT: vmovups %ymm2, 64(%rsi) +; XOP-NEXT: vmovups %ymm4, 160(%rsi) +; XOP-NEXT: vmovupd %ymm3, 192(%rsi) +; XOP-NEXT: vmovupd %ymm2, 224(%rsi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %i = load <64 x i32>, ptr %input, align 16 diff --git a/llvm/test/CodeGen/X86/pr46585.ll b/llvm/test/CodeGen/X86/pr46585.ll index 2ddf096683b7b..88d6c2f45c27c 100644 --- a/llvm/test/CodeGen/X86/pr46585.ll +++ b/llvm/test/CodeGen/X86/pr46585.ll @@ -7,7 +7,18 @@ define void @spam() 
local_unnamed_addr { ; CHECK-LABEL: spam: ; CHECK: ## %bb.0: ## %bb -; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rax +; CHECK-NEXT: movzbl (%rax), %eax +; CHECK-NEXT: andb $1, %al +; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: pxor %xmm1, %xmm1 +; CHECK-NEXT: pxor %xmm2, %xmm2 +; CHECK-NEXT: pcmpgtb %xmm0, %xmm2 +; CHECK-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; CHECK-NEXT: pmovmskb %xmm2, %eax ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je LBB0_2 ; CHECK-NEXT: ## %bb.1: ## %bb9 diff --git a/llvm/test/CodeGen/X86/pr48727.ll b/llvm/test/CodeGen/X86/pr48727.ll index 8a8f53c4f5bdb..2599f367a7ca3 100644 --- a/llvm/test/CodeGen/X86/pr48727.ll +++ b/llvm/test/CodeGen/X86/pr48727.ll @@ -4,16 +4,19 @@ define void @PR48727() { ; CHECK-LABEL: PR48727: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vcvttpd2dqy 0, %xmm0 ; CHECK-NEXT: vcvttpd2dqy 128, %xmm1 -; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vcvttpd2dqy 160, %xmm2 +; CHECK-NEXT: vpackssdw %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpackssdw %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; CHECK-NEXT: vcvttpd2dqy (%rax), %xmm2 ; CHECK-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; CHECK-NEXT: vpmovdw %zmm0, %ymm0 -; CHECK-NEXT: vmovdqu %ymm0, 16(%rax) +; CHECK-NEXT: vpackssdw %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6] +; CHECK-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; CHECK-NEXT: vmovdqu %ymm2, 16(%rax) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll index 6e77d3e4fd134..719689f07ae6b 100644 --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -599,27 +599,41 @@ define <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) { define fastcc void @test17() nounwind { ; X86-SSE-LABEL: test17: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [u,u,32768,32768] -; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; X86-SSE-NEXT: movapd %xmm0, (%eax) ; X86-SSE-NEXT: retl ; -; X86-AVX-LABEL: test17: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X86-AVX-NEXT: vmovaps %xmm0, (%eax) -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test17: +; X86-AVX1: # %bb.0: # %entry +; X86-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX1-NEXT: retl +; +; X86-AVX512-LABEL: test17: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X86-AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2,3] +; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX512-NEXT: retl ; ; X64-SSE-LABEL: test17: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [u,u,32768,32768] -; X64-SSE-NEXT: movaps %xmm0, (%rax) +; X64-SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; X64-SSE-NEXT: movapd %xmm0, (%rax) ; X64-SSE-NEXT: retq ; -; X64-AVX-LABEL: test17: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] -; X64-AVX-NEXT: vmovaps %xmm0, 
(%rax) -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test17: +; X64-AVX1: # %bb.0: # %entry +; X64-AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X64-AVX1-NEXT: vmovaps %xmm0, (%rax) +; X64-AVX1-NEXT: retq +; +; X64-AVX512-LABEL: test17: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [32768,32768,32768,32768] +; X64-AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2,3] +; X64-AVX512-NEXT: vmovaps %xmm0, (%rax) +; X64-AVX512-NEXT: retq entry: %0 = insertelement <4 x i32> undef, i32 undef, i32 1 %1 = shufflevector <4 x i32> , <4 x i32> %0, <4 x i32> @@ -702,8 +716,3 @@ define <4 x i32> @test_mul(<4 x i32> %x, <4 x i32> %y) { %m = mul <4 x i32> %x, %y ret <4 x i32> %m } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; X64-AVX1: {{.*}} -; X64-AVX512: {{.*}} -; X86-AVX1: {{.*}} -; X86-AVX512: {{.*}} diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll index 1a4df9a175ffa..4e10b20902062 100644 --- a/llvm/test/CodeGen/X86/sse3.ll +++ b/llvm/test/CodeGen/X86/sse3.ll @@ -216,14 +216,16 @@ define void @t9(ptr %r, ptr %A) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movaps (%ecx), %xmm0 -; X86-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-NEXT: movaps %xmm0, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: t9: ; X64: # %bb.0: ; X64-NEXT: movaps (%rdi), %xmm0 -; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: retq %tmp = load <4 x float>, ptr %r diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index 283c6a303a581..e3146e69e18ae 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -1566,11 +1566,11 @@ define <16 x i16> @var_shuffle_v16i16_from_v8i16(<8 x i16> %v, <16 x i16> %indic ; AVX512VLF-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VLF-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [514,514,514,514,514,514,514,514,514,514,514,514,514,514,514,514] ; AVX512VLF-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm2 +; AVX512VLF-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VLF-NEXT: vpshufb %ymm1, %ymm0, %ymm3 ; AVX512VLF-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 -; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm3 ^ ymm2)) +; AVX512VLF-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) ; AVX512VLF-NEXT: retq ; ; AVX512VLBW-LABEL: var_shuffle_v16i16_from_v8i16: diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll index 8c64dd2d9b49f..c5cacc944a371 100644 --- a/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-128-fp16.ll @@ -813,10 +813,16 @@ define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp +; X86-NEXT: movw $-3, %ax +; X86-NEXT: kmovd %eax, %k0 ; X86-NEXT: vucomish 8(%ebp), %xmm2 ; X86-NEXT: setae %al ; X86-NEXT: andl $1, %eax -; X86-NEXT: kmovw %eax, %k0 +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: 
kshiftrw $1, %k0, %k2 +; X86-NEXT: kshiftlw $1, %k2, %k2 +; X86-NEXT: korw %k1, %k2, %k1 +; X86-NEXT: kandw %k0, %k1, %k0 ; X86-NEXT: vpsrld $16, %xmm2, %xmm3 ; X86-NEXT: vucomish 10(%ebp), %xmm3 ; X86-NEXT: setae %al @@ -851,10 +857,16 @@ define <4 x i16> @test_v4f16_oge_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; ; X64-LABEL: test_v4f16_oge_q: ; X64: # %bb.0: +; X64-NEXT: movw $-3, %ax +; X64-NEXT: kmovd %eax, %k0 ; X64-NEXT: vucomish %xmm3, %xmm2 ; X64-NEXT: setae %al ; X64-NEXT: andl $1, %eax -; X64-NEXT: kmovw %eax, %k0 +; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kshiftrw $1, %k0, %k2 +; X64-NEXT: kshiftlw $1, %k2, %k2 +; X64-NEXT: korw %k1, %k2, %k1 +; X64-NEXT: kandw %k0, %k1, %k0 ; X64-NEXT: vpsrld $16, %xmm3, %xmm4 ; X64-NEXT: vpsrld $16, %xmm2, %xmm5 ; X64-NEXT: vucomish %xmm4, %xmm5 @@ -901,11 +913,17 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $16, %esp +; X86-NEXT: movw $-3, %ax +; X86-NEXT: kmovd %eax, %k0 ; X86-NEXT: vmovsh {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm2, %xmm3 ; X86-NEXT: seta %al ; X86-NEXT: andl $1, %eax -; X86-NEXT: kmovw %eax, %k0 +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: kshiftrw $1, %k0, %k2 +; X86-NEXT: kshiftlw $1, %k2, %k2 +; X86-NEXT: korw %k1, %k2, %k1 +; X86-NEXT: kandw %k0, %k1, %k0 ; X86-NEXT: vpsrld $16, %xmm2, %xmm3 ; X86-NEXT: vmovsh {{.*#+}} xmm4 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vcomish %xmm3, %xmm4 @@ -943,10 +961,16 @@ define <4 x i16> @test_v4f16_olt_q(<4 x i16> %a, <4 x i16> %b, <4 x half> %f1, < ; ; X64-LABEL: test_v4f16_olt_q: ; X64: # %bb.0: +; X64-NEXT: movw $-3, %ax +; X64-NEXT: kmovd %eax, %k0 ; X64-NEXT: vcomish %xmm2, %xmm3 ; X64-NEXT: seta %al ; X64-NEXT: andl $1, %eax -; X64-NEXT: kmovw %eax, %k0 +; X64-NEXT: kmovw %eax, %k1 +; X64-NEXT: kshiftrw $1, %k0, %k2 +; X64-NEXT: kshiftlw $1, %k2, %k2 +; X64-NEXT: korw %k1, %k2, %k1 +; X64-NEXT: kandw %k0, %k1, %k0 ; X64-NEXT: vpsrld $16, %xmm2, %xmm4 ; X64-NEXT: vpsrld $16, %xmm3, %xmm5 ; X64-NEXT: vcomish %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll index bde14e75dfc04..eae0a93745b52 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128-fp16.ll @@ -428,6 +428,17 @@ define <4 x i1> @strict_vector_fptosi_v4f16_to_v4i1(<4 x half> %a) #0 { ; NOVL-NEXT: vcvttsh2si %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: kmovw %eax, %k0 +; NOVL-NEXT: kshiftrw $1, %k0, %k1 +; NOVL-NEXT: kshiftlw $1, %k1, %k1 +; NOVL-NEXT: korw %k0, %k1, %k0 +; NOVL-NEXT: kshiftlw $12, %k0, %k0 +; NOVL-NEXT: kshiftrw $12, %k0, %k0 +; NOVL-NEXT: kshiftrw $4, %k0, %k1 +; NOVL-NEXT: kshiftlw $4, %k1, %k1 +; NOVL-NEXT: korw %k0, %k1, %k0 +; NOVL-NEXT: movw $-3, %ax +; NOVL-NEXT: kmovd %eax, %k1 +; NOVL-NEXT: kandw %k1, %k0, %k0 ; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; NOVL-NEXT: vcvttsh2si %xmm1, %eax ; NOVL-NEXT: kmovd %eax, %k1 @@ -440,7 +451,8 @@ define <4 x i1> @strict_vector_fptosi_v4f16_to_v4i1(<4 x half> %a) #0 { ; NOVL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; NOVL-NEXT: vcvttsh2si %xmm1, %eax ; NOVL-NEXT: kmovd %eax, %k1 -; NOVL-NEXT: kshiftlw $2, %k1, %k1 +; NOVL-NEXT: kshiftlw $15, %k1, %k1 +; NOVL-NEXT: kshiftrw $13, %k1, %k1 ; NOVL-NEXT: korw %k1, %k0, %k0 ; NOVL-NEXT: kshiftlw $13, %k0, %k0 ; NOVL-NEXT: kshiftrw $13, %k0, %k0 @@ -474,6 +486,17 @@ define <4 x i1> 
@strict_vector_fptoui_v4f16_to_v4i1(<4 x half> %a) #0 { ; NOVL-NEXT: vcvttsh2si %xmm0, %eax ; NOVL-NEXT: andl $1, %eax ; NOVL-NEXT: kmovw %eax, %k0 +; NOVL-NEXT: kshiftrw $1, %k0, %k1 +; NOVL-NEXT: kshiftlw $1, %k1, %k1 +; NOVL-NEXT: korw %k0, %k1, %k0 +; NOVL-NEXT: kshiftlw $12, %k0, %k0 +; NOVL-NEXT: kshiftrw $12, %k0, %k0 +; NOVL-NEXT: kshiftrw $4, %k0, %k1 +; NOVL-NEXT: kshiftlw $4, %k1, %k1 +; NOVL-NEXT: korw %k0, %k1, %k0 +; NOVL-NEXT: movw $-3, %ax +; NOVL-NEXT: kmovd %eax, %k1 +; NOVL-NEXT: kandw %k1, %k0, %k0 ; NOVL-NEXT: vpsrld $16, %xmm0, %xmm1 ; NOVL-NEXT: vcvttsh2si %xmm1, %eax ; NOVL-NEXT: kmovd %eax, %k1 @@ -486,7 +509,8 @@ define <4 x i1> @strict_vector_fptoui_v4f16_to_v4i1(<4 x half> %a) #0 { ; NOVL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; NOVL-NEXT: vcvttsh2si %xmm1, %eax ; NOVL-NEXT: kmovd %eax, %k1 -; NOVL-NEXT: kshiftlw $2, %k1, %k1 +; NOVL-NEXT: kshiftlw $15, %k1, %k1 +; NOVL-NEXT: kshiftrw $13, %k1, %k1 ; NOVL-NEXT: korw %k1, %k0, %k0 ; NOVL-NEXT: kshiftlw $13, %k0, %k0 ; NOVL-NEXT: kshiftrw $13, %k0, %k0 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll index 48a0b27a207f3..57f223e824e2e 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -1829,43 +1829,43 @@ define <2 x i16> @strict_vector_fptosi_v2f64_to_v2i16(<2 x double> %a) #0 { ; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; SSE-32: # %bb.0: ; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; SSE-64: # %bb.0: ; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #0 @@ -1888,31 +1888,31 @@ define <2 x i16> @strict_vector_fptoui_v2f64_to_v2i16(<2 x double> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: 
ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f64(<2 x double> %a, metadata !"fpexcept.strict") #0 @@ -1924,49 +1924,49 @@ define <2 x i16> @strict_vector_fptosi_v2f32_to_v2i16(<2 x float> %a) #0 { ; SSE-32: # %bb.0: ; SSE-32-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-32-NEXT: packssdw %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-32-NEXT: retl ; ; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; SSE-64: # %bb.0: ; SSE-64-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-64-NEXT: packssdw %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-64-NEXT: retq ; ; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f32(<2 x float> %a, metadata !"fpexcept.strict") #0 @@ -1992,35 +1992,35 @@ define <2 x i16> @strict_vector_fptoui_v2f32_to_v2i16(<2 x float> %a) #0 { ; AVX: # %bb.0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; 
AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: ret{{[l|q]}} ; ; AVX512F-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VL-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: ; AVX512VLDQ: # %bb.0: ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 ; AVX512VLDQ-NEXT: ret{{[l|q]}} %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f32(<2 x float> %a, metadata !"fpexcept.strict") #0 @@ -2052,8 +2052,7 @@ define <2 x i8> @strict_vector_fptosi_v2f64_to_v2i8(<2 x double> %a) #0 { ; AVX512F-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i8: @@ -2065,8 +2064,7 @@ define <2 x i8> @strict_vector_fptosi_v2f64_to_v2i8(<2 x double> %a) #0 { ; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: @@ -2104,8 +2102,7 @@ define <2 x i8> @strict_vector_fptoui_v2f64_to_v2i8(<2 x double> %a) #0 { ; AVX512F-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i8: @@ -2117,8 +2114,7 @@ define <2 x i8> @strict_vector_fptoui_v2f64_to_v2i8(<2 x double> %a) #0 { ; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: @@ -2160,8 +2156,7 @@ define <2 x i8> @strict_vector_fptosi_v2f32_to_v2i8(<2 x float> %a) #0 { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; 
AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i8: @@ -2175,8 +2170,7 @@ define <2 x i8> @strict_vector_fptosi_v2f32_to_v2i8(<2 x float> %a) #0 { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptosi_v2f32_to_v2i8: @@ -2219,8 +2213,7 @@ define <2 x i8> @strict_vector_fptoui_v2f32_to_v2i8(<2 x float> %a) #0 { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i8: @@ -2234,8 +2227,7 @@ define <2 x i8> @strict_vector_fptoui_v2f32_to_v2i8(<2 x float> %a) #0 { ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512VLDQ-LABEL: strict_vector_fptoui_v2f32_to_v2i8: diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll index 179e8ad69672b..da6cc74941fb0 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1517,28 +1517,32 @@ define <8 x i8> @strict_vector_fptosi_v8f32_to_v8i8(<8 x float> %a) #0 { ; AVX512F-LABEL: strict_vector_fptosi_v8f32_to_v8i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512DQVL-LABEL: strict_vector_fptosi_v8f32_to_v8i8: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: ret{{[l|q]}} %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f32(<8 x float> %a, @@ -1559,28 +1563,32 @@ define <8 x i8> @strict_vector_fptoui_v8f32_to_v8i8(<8 x float> %a) #0 { ; AVX512F-LABEL: strict_vector_fptoui_v8f32_to_v8i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512F-NEXT: 
vpmovdb %zmm0, %xmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512VL-LABEL: strict_vector_fptoui_v8f32_to_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} ; ; AVX512DQVL-LABEL: strict_vector_fptoui_v8f32_to_v8i8: ; AVX512DQVL: # %bb.0: ; AVX512DQVL-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512DQVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512DQVL-NEXT: vzeroupper ; AVX512DQVL-NEXT: ret{{[l|q]}} %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f32(<8 x float> %a, diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll index ce5db5b246775..c297fa7f3e69a 100644 --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -678,14 +678,16 @@ define <8 x i8> @strict_vector_fptosi_v8f64_to_v8i8(<8 x double> %a) #0 { ; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f64(<8 x double> %a, @@ -697,14 +699,16 @@ define <8 x i8> @strict_vector_fptoui_v8f64_to_v8i8(<8 x double> %a) #0 { ; AVX512VL-LABEL: strict_vector_fptoui_v8f64_to_v8i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: ret{{[l|q]}} ; ; AVX512DQ-LABEL: strict_vector_fptoui_v8f64_to_v8i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: ret{{[l|q]}} %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f64(<8 x double> %a, diff --git a/llvm/test/CodeGen/X86/vec_cast3.ll b/llvm/test/CodeGen/X86/vec_cast3.ll index 43bb538186403..6f124cc5fb34b 100644 --- a/llvm/test/CodeGen/X86/vec_cast3.ll +++ b/llvm/test/CodeGen/X86/vec_cast3.ll @@ -79,7 +79,7 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) { ; CHECK-LABEL: cvt_v2f32_v2i16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl %res = fptosi <2 x float> %src to <2 x i16> ret <2 x i16> 
%res @@ -109,7 +109,7 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) { ; CHECK-LABEL: cvt_v2f32_v2u16: ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl %res = fptoui <2 x float> %src to <2 x i16> ret <2 x i16> %res diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index df2dc77dc1259..86b66706d53a0 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2262,8 +2262,7 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { ; AVX512F-LABEL: fptosi_2f32_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f32_to_2i8: @@ -2275,8 +2274,7 @@ define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { ; AVX512DQ-LABEL: fptosi_2f32_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f32_to_2i8: @@ -2292,14 +2290,38 @@ define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { ; SSE-LABEL: fptosi_2f32_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_2f32_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; VEX-LABEL: fptosi_2f32_to_2i16: +; VEX: # %bb.0: +; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 +; VEX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f32_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f32_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f32_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f32_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %cvt = fptosi <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2322,8 +2344,7 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { ; AVX512F-LABEL: fptoui_2f32_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f32_to_2i8: @@ -2335,8 +2356,7 @@ define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { ; AVX512DQ-LABEL: fptoui_2f32_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] 
; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i8: @@ -2355,11 +2375,35 @@ define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f32_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; VEX-LABEL: fptoui_2f32_to_2i16: +; VEX: # %bb.0: +; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 +; VEX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f32_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f32_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f32_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f32_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %cvt = fptoui <2 x float> %a to <2 x i16> ret <2 x i16> %cvt } @@ -2382,8 +2426,7 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { ; AVX512F-LABEL: fptosi_2f64_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptosi_2f64_to_2i8: @@ -2395,8 +2438,7 @@ define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { ; AVX512DQ-LABEL: fptosi_2f64_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptosi_2f64_to_2i8: @@ -2412,14 +2454,38 @@ define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { ; SSE-LABEL: fptosi_2f64_to_2i16: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm0, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: fptosi_2f64_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; VEX-LABEL: fptosi_2f64_to_2i16: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VEX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptosi_2f64_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptosi_2f64_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptosi_2f64_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptosi_2f64_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %cvt = fptosi <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } @@ 
-2442,8 +2508,7 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { ; AVX512F-LABEL: fptoui_2f64_to_2i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fptoui_2f64_to_2i8: @@ -2455,8 +2520,7 @@ define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { ; AVX512DQ-LABEL: fptoui_2f64_to_2i8: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: retq ; ; AVX512VLDQ-LABEL: fptoui_2f64_to_2i8: @@ -2475,11 +2539,35 @@ define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: fptoui_2f64_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq +; VEX-LABEL: fptoui_2f64_to_2i16: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; VEX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; VEX-NEXT: retq +; +; AVX512F-LABEL: fptoui_2f64_to_2i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: fptoui_2f64_to_2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512DQ-LABEL: fptoui_2f64_to_2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: retq +; +; AVX512VLDQ-LABEL: fptoui_2f64_to_2i16: +; AVX512VLDQ: # %bb.0: +; AVX512VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VLDQ-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512VLDQ-NEXT: retq %cvt = fptoui <2 x double> %a to <2 x i16> ret <2 x i16> %cvt } diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 910dd1ee6c419..857317c6cff02 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -4225,28 +4225,29 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; AVX1-LABEL: uitofp_load_4i64_to_4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpsrlq $1, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 -; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 +; AVX1-NEXT: vpsrlq $1, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 +; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 +; AVX1-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm2 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 -; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, %rax +; 
AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm1, %xmm0 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 +; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] +; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vblendvps %xmm0, %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -4660,7 +4661,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vpsrlq $1, %xmm5, %xmm6 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm3 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm4 +; AVX1-NEXT: vblendvpd %ymm4, %ymm3, %ymm1, %ymm3 ; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vmovq %xmm3, %rax @@ -4682,7 +4684,8 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vpsrlq $1, %xmm4, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3 +; AVX1-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-NEXT: vmovq %xmm2, %rax diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 894186f9b343b..41ad21bbe50c3 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -2369,7 +2369,7 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp -; AVX512F-NEXT: subq $256, %rsp # imm = 0x100 +; AVX512F-NEXT: subq $320, %rsp # imm = 0x140 ; AVX512F-NEXT: movzbl 352(%rbp), %eax ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k0 @@ -2379,42 +2379,43 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-5, %ax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: kmovw %k1, %k3 +; AVX512F-NEXT: kmovw %eax, %k7 +; AVX512F-NEXT: kandw %k7, %k0, %k0 ; AVX512F-NEXT: movzbl 368(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-9, %ax -; AVX512F-NEXT: kmovw %eax, %k5 -; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw %eax, %k6 +; AVX512F-NEXT: kandw %k6, %k0, %k0 ; AVX512F-NEXT: movzbl 376(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $12, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-17, %ax -; AVX512F-NEXT: kmovw %eax, %k6 -; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k1, %k4 ; AVX512F-NEXT: movzbl 384(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, 
%k1, %k1 ; AVX512F-NEXT: kshiftrw $11, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-33, %ax -; AVX512F-NEXT: kmovw %eax, %k7 -; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: movzbl 392(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $10, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-65, %ax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %eax, %k3 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: movzbl 400(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 @@ -2466,9 +2467,9 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kshiftrw $4, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: movzbl 448(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 @@ -2482,369 +2483,410 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $2, %k1, %k1 -; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512F-NEXT: kmovw %eax, %k4 -; AVX512F-NEXT: kandw %k4, %k1, %k1 -; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: movzbl 464(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $14, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kshiftlw $1, %k1, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k0 ; AVX512F-NEXT: movzbl 472(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 232(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: movzbl 224(%rbp), %eax ; AVX512F-NEXT: andl $1, %eax -; AVX512F-NEXT: movzbl 232(%rbp), %r10d -; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $14, %k1, %k1 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: 
kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: movzbl 240(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $13, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: movzbl 248(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $12, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k4, %k0, %k0 ; AVX512F-NEXT: movzbl 256(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $11, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512F-NEXT: kandw %k5, %k0, %k0 ; AVX512F-NEXT: movzbl 264(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 ; AVX512F-NEXT: movzbl 272(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $9, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 ; AVX512F-NEXT: movzbl 280(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k0, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, 
%k0, %k0 ; AVX512F-NEXT: movzbl 288(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k0, %k2 -; AVX512F-NEXT: kshiftrw $7, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 296(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $6, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 304(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $5, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 312(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $4, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 320(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $3, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 328(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $2, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k4, %k1, %k1 -; AVX512F-NEXT: movzbl 336(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $14, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kshiftlw $1, %k1, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k1 -; AVX512F-NEXT: movzbl 344(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 296(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: movzbl 96(%rbp), %eax -; AVX512F-NEXT: andl $1, %eax -; AVX512F-NEXT: movzbl 104(%rbp), %r10d +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 304(%rbp), %r10d ; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $14, %k1, %k1 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: 
korw %k1, %k2, %k1 -; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 112(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $13, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k5, %k4 -; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k5, %k1, %k1 -; AVX512F-NEXT: movzbl 120(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $12, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k6, %k5 -; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k6, %k1, %k1 -; AVX512F-NEXT: movzbl 128(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $11, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k7, %k6 -; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 136(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 144(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $9, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 152(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $8, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 160(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $7, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 168(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $6, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 176(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $5, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 184(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $4, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 192(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $3, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 200(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $2, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), 
%k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 208(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $14, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kshiftlw $1, %k1, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k1 -; AVX512F-NEXT: movzbl 216(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: andl $1, %edi -; AVX512F-NEXT: kmovw %esi, %k1 +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 312(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $14, %k1, %k1 -; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $4, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: movzbl 320(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $3, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 328(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %edx, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $13, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k4, %k1, %k1 -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $12, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k5, %k1, %k1 -; AVX512F-NEXT: kmovw %r8d, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $11, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k6, %k1, %k1 -; AVX512F-NEXT: kmovw %r9d, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 16(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: movzbl 336(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-NEXT: movzbl 344(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 96(%rbp), %r10d +; AVX512F-NEXT: andl $1, %r10d +; AVX512F-NEXT: movzbl 104(%rbp), %r11d +; AVX512F-NEXT: kmovw %r11d, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; 
AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 112(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 120(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 128(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: movzbl 136(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 144(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 152(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: movzbl 160(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 168(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 176(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 184(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $4, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 192(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $3, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: movzbl 200(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: movzbl 208(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k0 +; 
AVX512F-NEXT: movzbl 216(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw %esi, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: andl $1, %edi +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %r8d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw %r9d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 16(%rbp), %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 24(%rbp), %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 32(%rbp), %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 40(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 48(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 56(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k7 +; AVX512F-NEXT: 
kshiftrw $4, %k7, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 64(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k6 +; AVX512F-NEXT: kshiftrw $3, %k6, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: movzbl 72(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k3 +; AVX512F-NEXT: kshiftrw $2, %k3, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k2, %k0, %k1 +; AVX512F-NEXT: movzbl 80(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $14, %k2, %k4 +; AVX512F-NEXT: korw %k4, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k4 +; AVX512F-NEXT: movzbl 88(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k5 +; AVX512F-NEXT: korw %k5, %k4, %k4 +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movw $-3, %dx +; AVX512F-NEXT: andl $1, %ecx +; AVX512F-NEXT: kshiftrw $1, %k0, %k4 +; AVX512F-NEXT: kshiftlw $1, %k4, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw %ecx, %k4 +; AVX512F-NEXT: korw %k4, %k0, %k4 +; AVX512F-NEXT: kshiftrw $8, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kunpckbw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k4 +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $14, %k4, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $13, %k4, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k7, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kshiftrw $11, %k6, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kshiftrw $10, %k3, %k3 +; AVX512F-NEXT: korw %k3, %k0, %k0 ; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 ; AVX512F-NEXT: kshiftrw $9, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k2, %k0, %k0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k0 +; AVX512F-NEXT: kshiftrw $9, %k0, %k0 +; AVX512F-NEXT: kshiftlw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kandw %k1, %k2, %k1 -; AVX512F-NEXT: movzbl 24(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $8, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 32(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $7, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512F-NEXT: korw %k1, %k5, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kunpckbw %k1, %k2, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 40(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $6, %k2, %k2 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 48(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k5 -; AVX512F-NEXT: kshiftrw $5, %k5, %k2 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 56(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k4 -; AVX512F-NEXT: kshiftrw $4, %k4, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 64(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k3 -; AVX512F-NEXT: kshiftrw $3, %k3, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 72(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $2, %k2, %k0 -; AVX512F-NEXT: korw %k0, %k1, %k0 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 80(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kshiftlw $14, %k1, %k7 -; AVX512F-NEXT: korw %k7, %k0, %k0 -; AVX512F-NEXT: kshiftlw $1, %k0, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k7 -; AVX512F-NEXT: movzbl 88(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: kshiftlw $15, %k0, %k6 -; AVX512F-NEXT: korw %k6, %k7, %k6 -; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: movw $-3, %ax -; AVX512F-NEXT: kmovw %eax, %k6 -; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kandw %k6, %k7, %k6 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kshiftrw $14, %k7, %k7 -; AVX512F-NEXT: korw %k7, %k6, %k6 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kandw %k7, %k6, %k6 -; AVX512F-NEXT: kshiftrw $13, %k5, %k5 -; AVX512F-NEXT: korw %k5, %k6, %k5 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 
# 2-byte Reload -; AVX512F-NEXT: kandw %k6, %k5, %k5 -; AVX512F-NEXT: kshiftrw $12, %k4, %k4 -; AVX512F-NEXT: korw %k4, %k5, %k4 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512F-NEXT: kandw %k5, %k4, %k4 -; AVX512F-NEXT: kshiftrw $11, %k3, %k3 -; AVX512F-NEXT: korw %k3, %k4, %k3 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512F-NEXT: kandw %k4, %k3, %k3 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k3, %k2 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512F-NEXT: kandw %k3, %k2, %k2 -; AVX512F-NEXT: kshiftlw $6, %k1, %k1 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kshiftlw $9, %k1, %k1 -; AVX512F-NEXT: kshiftrw $9, %k1, %k1 -; AVX512F-NEXT: kshiftlw $7, %k0, %k0 -; AVX512F-NEXT: korw %k0, %k1, %k0 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftlw $9, %k1, %k1 ; AVX512F-NEXT: kshiftrw $9, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -2857,32 +2899,42 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftrw $1, %k0, %k1 ; AVX512F-NEXT: kxorw %k1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kmovw %k5, %k7 +; AVX512F-NEXT: korw %k0, %k5, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kunpckbw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $12, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512F-NEXT: kandw %k5, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $11, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512F-NEXT: kandw %k4, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $10, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: kandw %k3, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kshiftlw $6, %k1, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftlw $9, %k0, %k0 ; AVX512F-NEXT: kshiftrw $9, %k0, %k0 @@ -2890,6 +2942,28 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kshiftlw $7, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: korw %k1, %k7, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kunpckbw %k1, %k7, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: korw %k7, %k1, %k1 +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kshiftlw $9, %k1, %k1 ; AVX512F-NEXT: kshiftrw $9, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -2902,7 +2976,7 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftrw $1, %k0, %k1 ; AVX512F-NEXT: kxorw %k1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512F-NEXT: kxorw %k2, %k3, %k0 @@ -2933,16 +3007,16 @@ define <64 x i8> @test_compress_v64i8(<64 x i8> %vec, <64 x i1> %mask, <64 x i8> ; AVX512F-NEXT: vpcompressd %zmm0, %zmm0 {%k2} {z} ; AVX512F-NEXT: vpternlogd {{.*#+}} zmm8 {%k2} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm6, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vpmovdb %zmm0, 64(%rsp,%rax) -; AVX512F-NEXT: vpmovdb %zmm3, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: andl $31, %ecx -; AVX512F-NEXT: vpmovdb %zmm2, 96(%rsp,%rcx) +; AVX512F-NEXT: vpmovdb %zmm0, 128(%rsp,%rcx) +; AVX512F-NEXT: vpmovdb %zmm3, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vpmovdb %zmm2, 160(%rsp,%rax) ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: andl $63, %edx ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm0 -; AVX512F-NEXT: vmovaps %ymm0, 128(%rsp,%rdx) +; AVX512F-NEXT: vmovaps %ymm0, 192(%rsp,%rdx) ; AVX512F-NEXT: vpmovdb %zmm4, %xmm0 ; AVX512F-NEXT: vpmovdb %zmm5, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 @@ -3745,7 +3819,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512F-NEXT: pushq %rbp ; AVX512F-NEXT: movq %rsp, %rbp ; AVX512F-NEXT: andq $-64, %rsp -; AVX512F-NEXT: subq $640, %rsp # imm = 0x280 +; AVX512F-NEXT: subq $704, %rsp # imm = 0x2C0 ; AVX512F-NEXT: movzbl 352(%rbp), %eax ; AVX512F-NEXT: andl $1, %eax ; AVX512F-NEXT: kmovw %eax, %k0 @@ -3764,463 +3838,506 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: movw $-9, %ax +; AVX512F-NEXT: kmovw %eax, %k7 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 376(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, 
%k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-17, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k1, %k4 +; AVX512F-NEXT: movzbl 384(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-33, %ax ; AVX512F-NEXT: kmovw %eax, %k5 ; AVX512F-NEXT: kandw %k5, %k0, %k0 -; AVX512F-NEXT: movzbl 376(%rbp), %eax +; AVX512F-NEXT: movzbl 392(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-65, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 400(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-129, %ax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 408(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-257, %ax # imm = 0xFEFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 416(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 424(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-1025, %ax # imm = 0xFBFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 432(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 440(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $4, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 448(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $3, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-8193, %ax # imm = 0xDFFF +; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 456(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; 
AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 464(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-NEXT: movzbl 472(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 232(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 224(%rbp), %eax +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 240(%rbp), %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movzbl 248(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-17, %ax -; AVX512F-NEXT: kmovw %eax, %k6 -; AVX512F-NEXT: kandw %k6, %k0, %k0 -; AVX512F-NEXT: movzbl 384(%rbp), %eax +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 256(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-33, %ax -; AVX512F-NEXT: kmovw %eax, %k7 -; AVX512F-NEXT: kandw %k7, %k0, %k0 -; AVX512F-NEXT: movzbl 392(%rbp), %eax +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: movzbl 264(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $10, %k1, %k1 -; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-65, %ax -; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 400(%rbp), %eax +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 272(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $9, %k1, %k1 -; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-129, %ax -; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kandw 
%k1, %k0, %k0 -; AVX512F-NEXT: movzbl 408(%rbp), %eax +; AVX512F-NEXT: movzbl 280(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-257, %ax # imm = 0xFEFF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 416(%rbp), %eax +; AVX512F-NEXT: movzbl 288(%rbp), %eax ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $7, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-513, %ax # imm = 0xFDFF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 424(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: movzbl 296(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftrw $6, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-1025, %ax # imm = 0xFBFF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 432(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: movzbl 304(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftrw $5, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-2049, %ax # imm = 0xF7FF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 440(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: movzbl 312(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftrw $4, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-4097, %ax # imm = 0xEFFF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 448(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: movzbl 320(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftrw $3, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: movw $-8193, %ax # imm = 0xDFFF -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 456(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: movzbl 328(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: movzbl 336(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-NEXT: movzbl 344(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k1 -; AVX512F-NEXT: movw $-16385, %ax # imm = 0xBFFF -; AVX512F-NEXT: kmovw %eax, %k4 -; AVX512F-NEXT: kandw %k4, %k1, %k1 -; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: movzbl 464(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $14, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kshiftlw $1, %k1, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k1 -; AVX512F-NEXT: movzbl 472(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: movzbl 224(%rbp), %eax -; AVX512F-NEXT: andl $1, %eax -; AVX512F-NEXT: movzbl 232(%rbp), %r10d +; AVX512F-NEXT: movzbl 96(%rbp), %r10d +; AVX512F-NEXT: andl $1, %r10d +; AVX512F-NEXT: movzbl 104(%rbp), %r11d +; AVX512F-NEXT: kmovw %r11d, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: movzbl 112(%rbp), %r10d ; AVX512F-NEXT: kmovw %r10d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $14, %k1, %k1 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 240(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $13, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k5, %k1, %k1 -; AVX512F-NEXT: movzbl 248(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $12, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k6, %k1, %k1 -; AVX512F-NEXT: movzbl 256(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $11, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 264(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 272(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $9, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k0 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 120(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw 
$12, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 128(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: movzbl 136(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 144(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 152(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 160(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: movzbl 168(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 176(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 184(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $4, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 192(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $3, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 200(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k2, %k0, %k0 +; AVX512F-NEXT: movzbl 208(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $14, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftlw $1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $1, %k0, %k0 +; AVX512F-NEXT: movzbl 216(%rbp), %r10d +; AVX512F-NEXT: kmovw %r10d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kmovw %esi, %k0 +; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $14, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: 
movzbl 280(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: andl $1, %edi +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $13, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $12, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %r8d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $11, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw %r9d, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $10, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: movzbl 16(%rbp), %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: movzbl 24(%rbp), %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $8, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k0, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 288(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k0, %k2 -; AVX512F-NEXT: kshiftrw $7, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 296(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $6, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 304(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 -; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $5, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 312(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kshiftrw $8, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 32(%rbp), %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kshiftrw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: movzbl 40(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $6, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: movzbl 48(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kshiftrw $5, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 56(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k7 +; AVX512F-NEXT: kshiftrw $4, %k7, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 64(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k6 +; AVX512F-NEXT: kshiftrw $3, %k6, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: movzbl 72(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k3 +; AVX512F-NEXT: kshiftrw $2, %k3, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kandw %k2, %k0, %k1 +; AVX512F-NEXT: movzbl 80(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $14, %k2, %k4 +; AVX512F-NEXT: korw %k4, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k4 +; AVX512F-NEXT: movzbl 88(%rbp), %edx +; AVX512F-NEXT: kmovw %edx, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k5 +; AVX512F-NEXT: korw %k5, %k4, %k4 +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: movw $-3, %dx +; AVX512F-NEXT: andl $1, %ecx +; AVX512F-NEXT: kshiftrw $1, %k0, %k4 +; AVX512F-NEXT: kshiftlw $1, %k4, %k0 ; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $4, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 320(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k0 +; AVX512F-NEXT: kmovw %ecx, %k4 +; AVX512F-NEXT: korw %k4, %k0, %k4 +; AVX512F-NEXT: kshiftrw $8, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $3, %k0, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 328(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $2, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k4, %k1, %k1 -; AVX512F-NEXT: movzbl 336(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $14, %k2, %k2 -; 
AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kshiftlw $1, %k1, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k1 -; AVX512F-NEXT: movzbl 344(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: movzbl 96(%rbp), %eax -; AVX512F-NEXT: andl $1, %eax -; AVX512F-NEXT: movzbl 104(%rbp), %r10d -; AVX512F-NEXT: kmovw %r10d, %k1 -; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $14, %k1, %k1 -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 112(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $13, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k5, %k4 -; AVX512F-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k5, %k1, %k1 -; AVX512F-NEXT: movzbl 120(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $12, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k6, %k5 -; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k6, %k1, %k1 -; AVX512F-NEXT: movzbl 128(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $11, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k7, %k6 -; AVX512F-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 136(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kunpckbw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k4 +; AVX512F-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $14, %k4, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kshiftrw $13, %k4, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 +; AVX512F-NEXT: kshiftrw $12, %k7, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 144(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 +; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kshiftrw $11, %k6, %k5 +; AVX512F-NEXT: korw %k5, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kshiftrw $10, %k3, %k3 +; AVX512F-NEXT: korw %k3, %k0, %k0 ; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512F-NEXT: kandw %k4, %k0, %k0 ; AVX512F-NEXT: kshiftrw $9, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 152(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, 
%k2, %k2 -; AVX512F-NEXT: kshiftrw $8, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 160(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $7, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 168(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $6, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k2, %k0, %k0 +; AVX512F-NEXT: kshiftlw $9, %k0, %k0 +; AVX512F-NEXT: kshiftrw $9, %k0, %k0 +; AVX512F-NEXT: kshiftlw $7, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512F-NEXT: korw %k1, %k5, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 176(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $5, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kunpckbw %k1, %k2, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 184(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $4, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 192(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $3, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 200(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $2, %k2, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload ; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 208(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $14, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kshiftlw $1, %k1, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k1 -; AVX512F-NEXT: movzbl 216(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: andl $1, %edi -; AVX512F-NEXT: kmovw %esi, %k1 -; AVX512F-NEXT: kshiftlw $15, %k1, %k1 -; AVX512F-NEXT: kshiftrw $14, %k1, %k1 -; AVX512F-NEXT: kmovw %edi, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: kmovw %edx, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $13, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k4, %k1, %k1 -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $12, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k5, %k1, %k1 -; AVX512F-NEXT: kmovw %r8d, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $11, %k2, %k2 -; AVX512F-NEXT: korw %k2, 
%k1, %k1 -; AVX512F-NEXT: kandw %k6, %k1, %k1 -; AVX512F-NEXT: kmovw %r9d, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k7, %k1, %k1 -; AVX512F-NEXT: movzbl 16(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $9, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kandw %k1, %k2, %k1 -; AVX512F-NEXT: movzbl 24(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $8, %k2, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload ; AVX512F-NEXT: kandw %k3, %k1, %k1 -; AVX512F-NEXT: movzbl 32(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $7, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 40(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kshiftrw $6, %k2, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k7, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 48(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k5 -; AVX512F-NEXT: kshiftrw $5, %k5, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 56(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k4 -; AVX512F-NEXT: kshiftrw $4, %k4, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512F-NEXT: kandw %k2, %k1, %k1 -; AVX512F-NEXT: movzbl 64(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k3 -; AVX512F-NEXT: kshiftrw $3, %k3, %k2 ; AVX512F-NEXT: korw %k2, %k1, %k1 -; AVX512F-NEXT: kandw %k0, %k1, %k1 -; AVX512F-NEXT: movzbl 72(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: kshiftlw $15, %k2, %k2 -; AVX512F-NEXT: kshiftrw $2, %k2, %k0 -; AVX512F-NEXT: korw %k0, %k1, %k0 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kandw %k1, %k0, %k0 -; AVX512F-NEXT: movzbl 80(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: kshiftlw $14, %k1, %k7 -; AVX512F-NEXT: korw %k7, %k0, %k0 -; AVX512F-NEXT: kshiftlw $1, %k0, %k0 -; AVX512F-NEXT: kshiftrw $1, %k0, %k7 -; AVX512F-NEXT: movzbl 88(%rbp), %eax -; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: kshiftlw $15, %k0, %k6 -; AVX512F-NEXT: korw %k6, %k7, %k6 -; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: movw $-3, %ax -; AVX512F-NEXT: kmovw %eax, %k6 -; AVX512F-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kandw %k6, 
%k7, %k6 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kshiftrw $14, %k7, %k7 -; AVX512F-NEXT: korw %k7, %k6, %k6 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload -; AVX512F-NEXT: kandw %k7, %k6, %k6 -; AVX512F-NEXT: kshiftrw $13, %k5, %k5 -; AVX512F-NEXT: korw %k5, %k6, %k5 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload -; AVX512F-NEXT: kandw %k6, %k5, %k5 -; AVX512F-NEXT: kshiftrw $12, %k4, %k4 -; AVX512F-NEXT: korw %k4, %k5, %k4 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload -; AVX512F-NEXT: kandw %k5, %k4, %k4 -; AVX512F-NEXT: kshiftrw $11, %k3, %k3 -; AVX512F-NEXT: korw %k3, %k4, %k3 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload -; AVX512F-NEXT: kandw %k4, %k3, %k3 -; AVX512F-NEXT: kshiftrw $10, %k2, %k2 -; AVX512F-NEXT: korw %k2, %k3, %k2 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload -; AVX512F-NEXT: kandw %k3, %k2, %k2 -; AVX512F-NEXT: kshiftlw $6, %k1, %k1 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kshiftlw $9, %k1, %k1 -; AVX512F-NEXT: kshiftrw $9, %k1, %k1 -; AVX512F-NEXT: kshiftlw $7, %k0, %k0 -; AVX512F-NEXT: korw %k0, %k1, %k0 -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftlw $9, %k1, %k1 ; AVX512F-NEXT: kshiftrw $9, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -4233,34 +4350,44 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftrw $1, %k0, %k1 ; AVX512F-NEXT: kxorw %k1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %eax +; AVX512F-NEXT: kmovw %k0, %ecx ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: vpcompressd %zmm2, %zmm2 {%k1} {z} -; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512F-NEXT: andl $1, %eax +; AVX512F-NEXT: kmovw %eax, %k0 +; AVX512F-NEXT: kmovw %k5, %k7 +; AVX512F-NEXT: korw %k0, %k5, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kandw %k1, %k0, %k0 +; AVX512F-NEXT: kunpckbw %k0, %k1, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload +; AVX512F-NEXT: kandw %k6, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: kandw %k7, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: kandw %k6, %k0, %k0 +; AVX512F-NEXT: kandw %k3, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $12, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload ; AVX512F-NEXT: kandw %k5, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $11, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload ; AVX512F-NEXT: kandw %k4, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: kshiftrw $10, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 -; AVX512F-NEXT: kandw %k3, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-NEXT: kshiftlw $6, 
%k1, %k1 +; AVX512F-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512F-NEXT: kandw %k3, %k0, %k0 +; AVX512F-NEXT: kshiftrw $9, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftlw $9, %k0, %k0 ; AVX512F-NEXT: kshiftrw $9, %k0, %k0 @@ -4268,6 +4395,28 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512F-NEXT: kshiftlw $7, %k1, %k1 ; AVX512F-NEXT: korw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512F-NEXT: korw %k1, %k7, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: kunpckbw %k1, %k7, %k1 +; AVX512F-NEXT: kandw %k6, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512F-NEXT: korw %k7, %k1, %k1 +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: kandw %k2, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k5, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k4, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: kandw %k3, %k1, %k1 +; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; AVX512F-NEXT: korw %k2, %k1, %k1 ; AVX512F-NEXT: kshiftlw $9, %k1, %k1 ; AVX512F-NEXT: kshiftrw $9, %k1, %k1 ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -4280,7 +4429,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kshiftrw $1, %k0, %k1 ; AVX512F-NEXT: kxorw %k1, %k0, %k0 -; AVX512F-NEXT: kmovw %k0, %ecx +; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload ; AVX512F-NEXT: vpcompressd %zmm3, %zmm3 {%k1} {z} ; AVX512F-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload @@ -4298,20 +4447,20 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ; AVX512F-NEXT: kxorw %k1, %k0, %k0 ; AVX512F-NEXT: kmovw %k0, %edx ; AVX512F-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: andl $31, %eax -; AVX512F-NEXT: vmovdqa64 %zmm1, 64(%rsp,%rax,4) -; AVX512F-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: andl $31, %ecx -; AVX512F-NEXT: vmovdqa64 %zmm3, 192(%rsp,%rcx,4) +; AVX512F-NEXT: vmovdqa64 %zmm1, 128(%rsp,%rcx,4) +; AVX512F-NEXT: vmovdqa64 %zmm2, {{[0-9]+}}(%rsp) +; AVX512F-NEXT: andl $31, %eax +; AVX512F-NEXT: vmovdqa64 %zmm3, 256(%rsp,%rax,4) ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 ; AVX512F-NEXT: vmovaps %zmm0, {{[0-9]+}}(%rsp) ; AVX512F-NEXT: andl $63, %edx ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 -; AVX512F-NEXT: vmovaps %zmm0, 320(%rsp,%rdx,4) +; AVX512F-NEXT: vmovaps %zmm0, 384(%rsp,%rdx,4) ; AVX512F-NEXT: vmovaps %zmm1, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vmovaps %zmm2, 384(%rsp,%rdx,4) +; AVX512F-NEXT: vmovaps %zmm2, 448(%rsp,%rdx,4) ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm0 ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm1 ; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %zmm2 @@ -4663,8 +4812,16 @@ define <3 x i32> 
@test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) nounwind ; AVX512F-LABEL: test_compress_narrow: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512F-NEXT: movw $-3, %ax +; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: andl $1, %edi -; AVX512F-NEXT: kmovw %edi, %k0 +; AVX512F-NEXT: kmovw %edi, %k1 +; AVX512F-NEXT: kshiftrw $1, %k0, %k2 +; AVX512F-NEXT: kshiftlw $1, %k2, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k0 ; AVX512F-NEXT: kmovw %esi, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 @@ -4688,8 +4845,16 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) nounwind ; ; AVX512VL-LABEL: test_compress_narrow: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movw $-3, %ax +; AVX512VL-NEXT: kmovd %eax, %k0 ; AVX512VL-NEXT: andl $1, %edi -; AVX512VL-NEXT: kmovw %edi, %k0 +; AVX512VL-NEXT: kmovw %edi, %k1 +; AVX512VL-NEXT: kshiftrw $1, %k0, %k2 +; AVX512VL-NEXT: kshiftlw $1, %k2, %k2 +; AVX512VL-NEXT: korw %k1, %k2, %k1 +; AVX512VL-NEXT: kshiftlw $4, %k0, %k2 +; AVX512VL-NEXT: korw %k1, %k2, %k1 +; AVX512VL-NEXT: kandw %k0, %k1, %k0 ; AVX512VL-NEXT: kmovd %esi, %k1 ; AVX512VL-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VL-NEXT: kshiftrw $14, %k1, %k1 @@ -4736,8 +4901,16 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; ; AVX512F-LABEL: test_compress_narrow_illegal_element_type: ; AVX512F: # %bb.0: +; AVX512F-NEXT: movw $-3, %ax +; AVX512F-NEXT: kmovw %eax, %k0 ; AVX512F-NEXT: andl $1, %ecx -; AVX512F-NEXT: kmovw %ecx, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: kshiftrw $1, %k0, %k2 +; AVX512F-NEXT: kshiftlw $1, %k2, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kshiftlw $4, %k0, %k2 +; AVX512F-NEXT: korw %k1, %k2, %k1 +; AVX512F-NEXT: kandw %k0, %k1, %k0 ; AVX512F-NEXT: kmovw %r8d, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $14, %k1, %k1 @@ -4769,8 +4942,16 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; ; AVX512VL-LABEL: test_compress_narrow_illegal_element_type: ; AVX512VL: # %bb.0: +; AVX512VL-NEXT: movw $-3, %ax +; AVX512VL-NEXT: kmovd %eax, %k0 ; AVX512VL-NEXT: andl $1, %ecx -; AVX512VL-NEXT: kmovw %ecx, %k0 +; AVX512VL-NEXT: kmovw %ecx, %k1 +; AVX512VL-NEXT: kshiftrw $1, %k0, %k2 +; AVX512VL-NEXT: kshiftlw $1, %k2, %k2 +; AVX512VL-NEXT: korw %k1, %k2, %k1 +; AVX512VL-NEXT: kshiftlw $4, %k0, %k2 +; AVX512VL-NEXT: korw %k1, %k2, %k1 +; AVX512VL-NEXT: kandw %k0, %k1, %k0 ; AVX512VL-NEXT: kmovd %r8d, %k1 ; AVX512VL-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VL-NEXT: kshiftrw $14, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 13f7d68ccb893..c30dffb40ef6b 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -907,6 +907,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,64,64,32,32,32,128,128,64] ; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,6,7] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7] ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index da902b3aed5ab..2aaa42f6c3c8b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -814,28 +814,31 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,1] +; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5],xmm8[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] -; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13] +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0],xmm9[1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7] -; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7] ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6],xmm5[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovaps %ymm2, (%rsi) ; AVX-NEXT: vmovaps 
%ymm7, (%rdx) @@ -1462,34 +1465,33 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i16_stride3_vf32: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm5 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm6[1],xmm2[2,3],xmm6[4],xmm2[5,6],xmm6[7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm7 -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 +; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,2,1] +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 176(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 160(%rdi), %xmm8 -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7] -; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm10 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1,2],xmm10[3,4,5,6,7] +; AVX-NEXT: vmovdqa 176(%rdi), %xmm8 +; AVX-NEXT: vmovdqa 160(%rdi), %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm13 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm4 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm10 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7] @@ -1498,51 +1500,63 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,1,2,1] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: 
vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm6[1],xmm14[2,3],xmm6[4],xmm14[5,6],xmm6[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7] -; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1,2],xmm1[3],xmm13[4,5],xmm1[6],xmm13[7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] +; AVX-NEXT: vpshufb %xmm1, %xmm13, %xmm13 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7] ; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5],xmm12[6],xmm13[7] -; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm13 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7] +; AVX-NEXT: vpshufb %xmm1, %xmm13, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm2[2],xmm6[3,4],xmm2[5],xmm6[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; AVX-NEXT: 
vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3],mem[4],xmm1[5,6],mem[7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3],xmm12[4],xmm2[5,6],xmm12[7] ; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6],xmm12[7] -; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm3, (%rsi) -; AVX-NEXT: vmovaps %ymm0, 32(%rdx) +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm2, (%rsi) +; AVX-NEXT: vmovaps %ymm15, 32(%rdx) ; AVX-NEXT: vmovaps %ymm14, (%rdx) -; AVX-NEXT: vmovaps %ymm2, 32(%rcx) -; AVX-NEXT: vmovaps %ymm1, (%rcx) +; AVX-NEXT: vmovaps %ymm1, 32(%rcx) +; AVX-NEXT: vmovaps %ymm0, (%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1811,37 +1825,38 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3,4,5,6,7] +; AVX512-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX512-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm9 ^ ymm10)) ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; 
AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm5 ^ ymm6)) +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-NEXT: vpshufb %ymm11, %ymm8, %ymm8 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm8[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9)) ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1851,17 +1866,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4] ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; 
AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5)) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm10 ^ ymm9)) ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -1871,7 +1886,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1889,37 +1904,38 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm9 ^ ymm10)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} 
ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm5 ^ ymm6)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm8[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -1929,17 +1945,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm10 ^ ymm9)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -1949,7 +1965,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq @@ -1967,37 +1983,38 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm9 ^ ymm10)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: 
vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm5 ^ ymm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm8, %ymm8 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm8[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -2007,17 +2024,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm10 ^ ymm9)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -2027,7 +2044,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -2045,37 +2062,38 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm10 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm9 ^ ymm10)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm5 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm8[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] @@ -2085,17 +2103,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm8 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm10 ^ ymm9)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -2105,7 +2123,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -2693,214 +2711,240 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i16_stride3_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $408, %rsp # imm = 0x198 -; AVX-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX-NEXT: subq $440, %rsp # imm = 0x1B8 +; AVX-NEXT: vmovdqa 176(%rdi), %xmm8 +; AVX-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7] +; AVX-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm13 +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,3,2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 368(%rdi), %xmm14 -; AVX-NEXT: vmovdqa 352(%rdi), %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,1,2,1] +; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 368(%rdi), %xmm10 +; AVX-NEXT: vmovdqa 352(%rdi), %xmm5 +; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7] -; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa 336(%rdi), %xmm9 -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 288(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa 320(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 320(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm10[2],xmm5[3,4],xmm10[5],xmm5[6,7] +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vmovdqa 336(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 208(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 192(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7] ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa 
240(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX-NEXT: vmovdqa 224(%rdi), %xmm3 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa 224(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 80(%rdi), %xmm13 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] -; AVX-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7] +; AVX-NEXT: vmovdqa 272(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 256(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vmovdqa 240(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 80(%rdi), %xmm9 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm9[2],xmm14[3,4],xmm9[5],xmm14[6,7] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm6 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] +; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] +; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd 
{{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm15 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm4[1],xmm15[2,3],xmm4[4],xmm15[5,6],xmm4[7] -; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,2,1] -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm12[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6],xmm8[7] +; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm8[0,1],mem[2],xmm8[3,4],mem[5],xmm8[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,14,15,8,9,14,15,12,13,14,15] +; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,1],xmm1[2],mem[3,4],xmm1[5],mem[6,7] +; AVX-NEXT: vmovdqa %xmm11, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm2[1,2],xmm11[3],xmm2[4,5],xmm11[6],xmm2[7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7] -; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7] -; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0,1],mem[2],xmm12[3,4],mem[5],xmm12[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7] -; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload +; 
AVX-NEXT: # xmm0 = xmm10[0,1],mem[2],xmm10[3,4],mem[5],xmm10[6,7] +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm12, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = xmm11[0,1],mem[2],xmm11[3,4],mem[5],xmm11[6,7] +; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0],xmm3[1,2],mem[3],xmm3[4,5],mem[6],xmm3[7] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1,2],xmm1[3],xmm12[4,5],xmm1[6],xmm12[7] -; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm14[2],xmm9[3,4],xmm14[5],xmm9[6,7] +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] +; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm9[0,1],mem[2],xmm9[3,4],mem[5],xmm9[6,7] +; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1,2],xmm14[3],xmm1[4,5],xmm14[6],xmm1[7] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0,1],xmm12[2],mem[3,4],xmm12[5],mem[6,7] -; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7] -; AVX-NEXT: vpshufb 
%xmm2, %xmm12, %xmm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7] -; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3],mem[4],xmm12[5,6],mem[7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm12 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm8 +; AVX-NEXT: vpshufb %xmm12, %xmm15, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = mem[0,1],xmm5[2],mem[3,4],xmm5[5],mem[6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6],xmm14[7] -; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 -; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload -; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7] -; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm7 -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7] -; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7] -; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7] -; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2],xmm0[3,4],mem[5],xmm0[6,7] +; 
AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6],xmm8[7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; AVX-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[0],xmm10[1],mem[2,3],xmm10[4],mem[5,6],xmm10[7] +; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1],xmm11[2],mem[3,4],xmm11[5],mem[6,7] +; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7] +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm8 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3],mem[4],xmm3[5,6],mem[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7] ; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7] +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0],xmm9[1],mem[2,3],xmm9[4],mem[5,6],xmm9[7] +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm14[1],xmm4[2,3],xmm14[4],xmm4[5,6],xmm14[7] ; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, 64(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, 96(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, 64(%rdx) -; 
AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, 96(%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm2, 32(%rdx) -; AVX-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 64(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 96(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 32(%rsi) +; AVX-NEXT: vmovaps %ymm12, 64(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, (%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX-NEXT: vmovaps %ymm3, (%rcx) -; AVX-NEXT: vmovaps %ymm0, 96(%rcx) -; AVX-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX-NEXT: addq $408, %rsp # imm = 0x198 +; AVX-NEXT: vmovaps %ymm8, 96(%rcx) +; AVX-NEXT: vmovaps %ymm2, 32(%rcx) +; AVX-NEXT: addq $440, %rsp # imm = 0x1B8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -3425,657 +3469,665 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512-NEXT: vmovdqa 272(%rdi), %xmm5 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-NEXT: 
vmovdqa %ymm0, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX512-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = 
ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-NEXT: vmovdqa 304(%rdi), %xmm6 +; AVX512-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm20 +; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm21 +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX512-NEXT: vmovdqa64 112(%rdi), %xmm17 +; AVX512-NEXT: vmovdqa64 96(%rdi), %xmm24 +; AVX512-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm1[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm19 ^ (ymm7 & (ymm18 ^ ymm19)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7],ymm2[8,9,10],ymm7[11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512-NEXT: 
vmovdqa %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm16 +; AVX512-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm15 ^ ymm14)) +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm13 ^ (ymm3 & (ymm7 ^ ymm13)) +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm14 ^ ymm15)) +; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm15 ^ (ymm1 & (ymm14 ^ ymm15)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7],ymm2[8],ymm14[9,10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX512-NEXT: 
vmovdqa {{.*#+}} ymm15 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm13 ^ ymm7)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm4 & (ymm7 ^ ymm13)) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX512-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm20 ^ (ymm1 & (ymm21 ^ ymm20)) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7],ymm2[8],ymm1[9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm13[2],xmm6[3,4],xmm13[5],xmm6[6,7] +; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm19 ^ (ymm4 & (ymm18 ^ ymm19)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX512-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm8[1],xmm15[2,3],xmm8[4],xmm15[5,6],xmm8[7] +; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm19 ^ ymm18)) +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20)) +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; 
AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] +; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2,3],ymm5[4],ymm14[5,6],ymm5[7],ymm14[8],ymm5[9],ymm14[10,11],ymm5[12],ymm14[13,14],ymm5[15] +; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; 
AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm6 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm21 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 112(%rdi), %xmm17 +; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm25 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm19 ^ (ymm7 & (ymm18 ^ ymm19)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7],ymm2[8,9,10],ymm7[11,12,13,14,15] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm15 ^ ymm14)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm13 ^ (ymm3 & (ymm7 ^ ymm13)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1] +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm14 ^ ymm15)) +; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm15 ^ (ymm1 & (ymm14 ^ ymm15)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7],ymm2[8],ymm14[9,10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm13 ^ ymm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm4 & (ymm7 ^ ymm13)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm20 ^ (ymm1 & (ymm21 ^ ymm20)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7],ymm2[8],ymm1[9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm13[2],xmm6[3,4],xmm13[5],xmm6[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm19 ^ (ymm4 & (ymm18 ^ ymm19)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm8[1],xmm15[2,3],xmm8[4],xmm15[5,6],xmm8[7] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm19 ^ ymm18)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2,3],ymm5[4],ymm14[5,6],ymm5[7],ymm14[8],ymm5[9],ymm14[10,11],ymm5[12],ymm14[13,14],ymm5[15] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; 
AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride3_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm5 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512DQ-NEXT: vmovdqa 
112(%rdi), %xmm15 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm6 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm20 +; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm21 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm15 +; 
AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX512DQ-NEXT: vmovdqa64 112(%rdi), %xmm17 +; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %xmm24 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm19 ^ (ymm7 & (ymm18 ^ ymm19)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3,4,5,6,7],ymm2[8,9,10],ymm7[11,12,13,14,15] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm16 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm15 ^ ymm14)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm13 ^ (ymm3 & (ymm7 ^ ymm13)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm14 ^ ymm15)) +; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm15 ^ (ymm1 & (ymm14 ^ ymm15)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7],ymm2[8],ymm14[9,10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm13 ^ ymm7)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm4 & (ymm7 ^ ymm13)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} xmm5 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm20 ^ (ymm1 & (ymm21 ^ ymm20)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7],ymm2[8],ymm1[9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm13[2],xmm6[3,4],xmm13[5],xmm6[6,7] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm19 ^ (ymm4 & (ymm18 ^ ymm19)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm8[1],xmm15[2,3],xmm8[4],xmm15[5,6],xmm8[7] +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm19 ^ ymm18)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] ; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2,3],ymm5[4],ymm14[5,6],ymm5[7],ymm14[8],ymm5[9],ymm14[10,11],ymm5[12],ymm14[13,14],ymm5[15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 112(%rdi), %xmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %xmm24 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm6[2],xmm9[3,4],xmm6[5],xmm9[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm19 ^ (ymm7 & (ymm18 ^ ymm19)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm2[0,1,2],ymm7[3,4,5,6,7],ymm2[8,9,10],ymm7[11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm2 & (ymm15 ^ ymm14)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7,8,9],ymm12[10],ymm2[11,12],ymm12[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm13 ^ (ymm3 & (ymm7 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm15 ^ (ymm0 & (ymm14 ^ ymm15)) +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,6,7,4] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm15 ^ (ymm1 & (ymm14 ^ ymm15)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm14[1,2],ymm2[3],ymm14[4,5],ymm2[6],ymm14[7],ymm2[8],ymm14[9,10],ymm2[11],ymm14[12,13],ymm2[14],ymm14[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm13 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm4 & (ymm7 ^ ymm13)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm24 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm20 ^ (ymm1 & (ymm21 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7],ymm2[8],ymm1[9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm13[2],xmm6[3,4],xmm13[5],xmm6[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm19 ^ (ymm4 & (ymm18 ^ ymm19)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm8[1],xmm15[2,3],xmm8[4],xmm15[5,6],xmm8[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm19 ^ ymm18)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm14[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2,3],ymm5[4],ymm14[5,6],ymm5[7],ymm14[8],ymm5[9],ymm14[10,11],ymm5[12],ymm14[13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1],xmm12[2],xmm11[3,4],xmm12[5],xmm11[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2],ymm5[3,4,5,6,7],ymm4[8,9,10],ymm5[11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper 
; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 01aacc1e06258..9122267be4724 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -562,14 +562,18 @@ define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-LABEL: load_i16_stride4_vf8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -612,14 +616,18 @@ define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FP-LABEL: load_i16_stride4_vf8: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -660,14 +668,18 @@ define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX2-FCP-LABEL: load_i16_stride4_vf8: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, 
%ymm1, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -1021,26 +1033,30 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-LABEL: load_i16_stride4_vf16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpackusdw 
%xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3 @@ -1125,26 +1141,30 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FP-LABEL: load_i16_stride4_vf16: ; AVX2-FP: # %bb.0: -; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 +; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FP-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FP-NEXT: 
vmovdqa 16(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -1225,11 +1245,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15] ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -2088,46 +2112,54 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i16_stride4_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $168, %rsp -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm3, %xmm3 +; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill @@ -2320,46 +2352,54 @@ define void 
@load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i16_stride4_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $184, %rsp -; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FP-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FP-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm6, %xmm6 +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FP-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2542,149 +2582,157 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: subq $104, %rsp ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; 
AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6,7],ymm4[8],ymm8[9,10,11],ymm4[12],ymm8[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1,2,3],ymm2[4],ymm8[5,6,7],ymm2[8],ymm8[9,10,11],ymm2[12],ymm8[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm5 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm6 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8 -; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm10 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm9 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8 -; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm11 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1,2,3],ymm1[4],ymm8[5,6,7],ymm1[8],ymm8[9,10,11],ymm1[12],ymm8[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7],ymm0[8],ymm8[9,10,11],ymm0[12],ymm8[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm8 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm15 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm8 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13 -; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm11 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] +; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm11 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm4 -; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm10 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm12 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm3 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} 
ymm6 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm4 -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm7 +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] +; 
AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm7 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,3,1,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx) -; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) -; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r8) +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) +; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rcx) +; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8) ; AVX2-FCP-NEXT: addq $104, %rsp ; AVX2-FCP-NEXT: vzeroupper @@ -4300,86 +4348,102 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i16_stride4_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $696, %rsp # imm = 0x2B8 -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; 
AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX2-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-NEXT: vpackusdw %xmm10, %xmm9, %xmm9 +; AVX2-NEXT: vpackusdw %xmm9, %xmm9, %xmm9 +; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 +; AVX2-NEXT: vpackusdw %xmm10, %xmm10, %xmm10 +; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm2[1,2,3],ymm8[4],ymm2[5,6,7],ymm8[8],ymm2[9,10,11],ymm8[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-NEXT: vpackusdw %xmm10, %xmm8, %xmm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm2[1,2,3],ymm7[4],ymm2[5,6,7],ymm7[8],ymm2[9,10,11],ymm7[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-NEXT: vpackusdw %xmm10, %xmm7, %xmm7 +; AVX2-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-NEXT: vpackusdw %xmm7, %xmm7, %xmm7 +; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-NEXT: vpackusdw %xmm9, %xmm8, %xmm8 +; AVX2-NEXT: vpackusdw %xmm8, %xmm8, %xmm8 +; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm2[1,2,3],ymm6[4],ymm2[5,6,7],ymm6[8],ymm2[9,10,11],ymm6[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-NEXT: vpackusdw %xmm8, %xmm6, %xmm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7],ymm5[8],ymm2[9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-NEXT: vpackusdw %xmm8, %xmm5, %xmm5 +; AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 +; 
AVX2-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm6, %xmm6 +; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6,7],ymm4[8],ymm2[9,10,11],ymm4[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7],ymm3[8],ymm2[9,10,11],ymm3[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-NEXT: vpackusdw %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; 
AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm4 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4804,86 +4868,102 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i16_stride4_vf64: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: 
vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FP-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX2-FP-NEXT: vpackusdw %xmm10, %xmm9, %xmm9 +; AVX2-FP-NEXT: vpackusdw %xmm9, %xmm9, %xmm9 +; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX2-FP-NEXT: vpackusdw %xmm11, %xmm10, %xmm10 +; AVX2-FP-NEXT: vpackusdw %xmm10, %xmm10, %xmm10 +; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm2[1,2,3],ymm8[4],ymm2[5,6,7],ymm8[8],ymm2[9,10,11],ymm8[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX2-FP-NEXT: vpackusdw %xmm10, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm2[1,2,3],ymm7[4],ymm2[5,6,7],ymm7[8],ymm2[9,10,11],ymm7[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm10 +; AVX2-FP-NEXT: vpackusdw %xmm10, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 +; AVX2-FP-NEXT: vpackusdw %xmm8, %xmm7, %xmm7 +; AVX2-FP-NEXT: vpackusdw %xmm7, %xmm7, %xmm7 +; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX2-FP-NEXT: vpackusdw %xmm9, %xmm8, %xmm8 +; AVX2-FP-NEXT: vpackusdw %xmm8, %xmm8, %xmm8 +; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm2[1,2,3],ymm6[4],ymm2[5,6,7],ymm6[8],ymm2[9,10,11],ymm6[12],ymm2[13,14,15] +; 
AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX2-FP-NEXT: vpackusdw %xmm8, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7],ymm5[8],ymm2[9,10,11],ymm5[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm8 +; AVX2-FP-NEXT: vpackusdw %xmm8, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FP-NEXT: vpackusdw %xmm5, %xmm5, %xmm5 +; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7 +; AVX2-FP-NEXT: vpackusdw %xmm7, %xmm6, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm6, %xmm6 +; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6,7],ymm4[8],ymm2[9,10,11],ymm4[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm2[1,2,3],ymm3[4],ymm2[5,6,7],ymm3[8],ymm2[9,10,11],ymm3[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FP-NEXT: vpackusdw %xmm6, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 -; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3 ; AVX2-FP-NEXT: 
vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2 -; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15] -; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = mem[0],ymm2[1,2,3],mem[4],ymm2[5,6,7],mem[8],ymm2[9,10,11],mem[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FP-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm4, %xmm4 +; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] +; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5279,246 +5359,262 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX2-FCP-LABEL: load_i16_stride4_vf64: ; AVX2-FCP: # %bb.0: -; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FCP-NEXT: subq $664, %rsp # imm = 0x298 +; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6] -; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX2-FCP-NEXT: vpackusdw %xmm7, %xmm5, %xmm5 -; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1,2,3],ymm3[4],ymm1[5,6,7],ymm3[8],ymm1[9,10,11],ymm3[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 +; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm3, %xmm3 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,7] 
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7],ymm2[8],ymm1[9,10,11],ymm2[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 +; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm6 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6] +; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm5, %xmm3 -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm14 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm7 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15] -; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29] +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 +; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm8 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm1[1,2,3],ymm5[4],ymm1[5,6,7],ymm5[8],ymm1[9,10,11],ymm5[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm5, %xmm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX2-FCP-NEXT: vpackusdw %xmm6, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm5 +; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm6 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm4, %xmm4 +; 
AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm9 -; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm3 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm12 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm5 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm12 +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3],ymm4[4],ymm1[5,6,7],ymm4[8],ymm1[9,10,11],ymm4[12],ymm1[13,14,15] +; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 +; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm6 +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm4 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm0 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm1 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm10 -; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm14 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm13 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm10 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm14 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm15 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm9 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm9 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm13 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, 
%ymm14 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm11 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm9 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm13 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm6 -; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm14 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm9 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm9 +; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm8 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm0 +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm5 -; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm3 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm11 -; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] -; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm4 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, 
%xmm14 +; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] +; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7] -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload +; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm12 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3] -; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[3,1,2,3] +; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm6[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm14 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm11 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX2-FCP-NEXT: # xmm9 = mem[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; AVX2-FCP-NEXT: # xmm10 = mem[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = 
xmm2[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3] -; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,0,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[3,1,2,3] +; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[3,1,2,3] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[2,0,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm0 -; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm5 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm4 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,3] ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,3] @@ -5529,9 +5625,9 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] @@ -5546,21 +5642,21 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm1 -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm1 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7] @@ -5571,9 +5667,9 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7] @@ -5581,7 +5677,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[3,1,2,3,4,5,6,7] ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -5613,7 +5709,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r8) ; AVX2-FCP-NEXT: vmovdqa %ymm15, (%r8) -; AVX2-FCP-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FCP-NEXT: addq $664, %rsp # imm = 0x298 ; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 9b19ec15c6f55..6814fc1d7cf60 100644 
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1515,117 +1515,119 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX-LABEL: load_i16_stride5_vf16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1] -; AVX-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm3 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7] +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX-NEXT: vandps %ymm11, %ymm9, %ymm12 -; AVX-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1] -; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13 -; AVX-NEXT: vorps %ymm13, %ymm12, %ymm12 -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] -; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX-NEXT: vandps %ymm4, %ymm11, %ymm6 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,0,1] +; AVX-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5,4,7] +; AVX-NEXT: vmovdqa 112(%rdi), %xmm7 +; 
AVX-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm8[1],xmm7[1] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm8 +; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm10[2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm9, %ymm9 +; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9 +; AVX-NEXT: vorps %ymm6, %ymm9, %ymm6 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] -; AVX-NEXT: vpsllq $48, %xmm9, %xmm13 -; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13 -; AVX-NEXT: vpsrlq $48, %xmm4, %xmm14 -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3] +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6 +; AVX-NEXT: vpsrlq $48, %xmm1, %xmm12 +; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3,4,5,6,7] +; AVX-NEXT: vandps %ymm11, %ymm12, %ymm12 +; AVX-NEXT: vpsllq $48, %xmm4, %xmm13 +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,3,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5] +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] -; AVX-NEXT: vandps %ymm11, %ymm14, %ymm11 -; AVX-NEXT: vorps %ymm13, %ymm11, %ymm11 +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 +; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm11 +; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11 +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2,3],xmm10[4,5],xmm9[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4],xmm12[5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u] -; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] +; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm8[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3] +; AVX-NEXT: 
vpshufd {{.*#+}} xmm13 = xmm0[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[0,1,2,0] ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 -; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u] -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm14 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] +; AVX-NEXT: vpsrlq $48, %xmm8, %xmm14 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2,3],xmm9[4,5],xmm10[6,7] ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovaps %ymm10, (%rsi) +; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,1,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3],xmm5[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm9[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5],xmm0[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX-NEXT: vmovaps %ymm6, (%rsi) ; AVX-NEXT: vmovaps %ymm11, (%rdx) ; AVX-NEXT: vmovaps %ymm12, (%rcx) ; AVX-NEXT: vmovaps %ymm13, (%r8) @@ -2891,305 +2893,301 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-LABEL: load_i16_stride5_vf32: ; AVX: # %bb.0: ; AVX-NEXT: subq $424, %rsp # imm = 0x1A8 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm10 -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1] -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 80(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm5 -; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa (%rdi), %xmm9 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm12 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 48(%rdi), %xmm15 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,0,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] -; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; 
AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] +; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm7 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[0,1,0,3] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6,7] +; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm14 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3 -; AVX-NEXT: vmovaps 64(%rdi), %xmm5 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1] -; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4 -; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 304(%rdi), %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 288(%rdi), %xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7] -; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,2,3] +; 
AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 176(%rdi), %xmm10 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[3,1,2,3] +; AVX-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,3] +; AVX-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; AVX-NEXT: vmovdqa 256(%rdi), %xmm15 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX-NEXT: vmovdqa 192(%rdi), %xmm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0 -; AVX-NEXT: vmovaps 224(%rdi), %xmm1 -; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] -; AVX-NEXT: vandnps %ymm8, %ymm6, %ymm8 -; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vmovaps %ymm5, %ymm3 +; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa 224(%rdi), %xmm5 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1] +; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2 +; AVX-NEXT: vandnps %ymm2, %ymm3, %ymm2 +; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vmovdqa 304(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 288(%rdi), %xmm2 +; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vmovdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm8[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa %xmm11, %xmm6 -; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpsrlq $48, %xmm12, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vmovaps %ymm3, %ymm2 +; AVX-NEXT: vandps %ymm3, %ymm0, %ymm0 +; AVX-NEXT: vpsllq $48, %xmm14, %xmm9 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm9 +; AVX-NEXT: vandnps %ymm9, %ymm3, %ymm9 +; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3],xmm3[4,5],xmm14[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm9[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpsrlq $48, %xmm10, %xmm1 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpsrlq $48, %xmm12, %xmm9 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1] -; AVX-NEXT: 
vmovdqa %xmm15, %xmm12 +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = xmm11[0,1],mem[2,3],xmm11[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm15[2,3],xmm9[4,5,6,7] ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3 -; AVX-NEXT: vpsllq $48, %xmm5, %xmm9 -; AVX-NEXT: vandnps %ymm9, %ymm10, %ymm9 -; AVX-NEXT: vorps %ymm3, %ymm9, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7] -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload -; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpsllq $48, %xmm5, %xmm8 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm8 +; AVX-NEXT: vandnps %ymm8, %ymm2, %ymm5 +; AVX-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm4[4,5],xmm2[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: 
vpblendw {{.*#+}} xmm4 = xmm3[0,1],xmm14[2,3],xmm3[4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3,4,5],xmm4[6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] +; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm8[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm8, %xmm8 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX-NEXT: vandps %ymm1, %ymm8, %ymm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm2[2,3],xmm8[4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1,2,3],xmm15[4,5],xmm9[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7] +; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] +; AVX-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpsllq $48, %xmm14, %xmm3 -; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,2,0] +; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3] -; AVX-NEXT: 
vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] +; AVX-NEXT: vpsrlq $48, %xmm13, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3,4,5,6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0] -; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovdqa %xmm5, %xmm9 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vmovdqa %xmm15, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vmovdqa %xmm10, %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7] -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0] -; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm0 -; AVX-NEXT: vpsrlq $48, 
%xmm2, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm3[2,3],xmm13[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4,5],xmm4[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm9[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm3[4,5],xmm8[6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7] -; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm2, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa %xmm11, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm7, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 
= xmm4[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm15[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm12[4,5],xmm6[6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,4,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,6] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,3,2,3] +; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] +; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[2,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm4[0,1,2,3],xmm1[4,5,6,7] +; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[1,1,1,1] ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX-NEXT: # xmm5 = mem[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,3,2,3] -; AVX-NEXT: vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] -; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX-NEXT: # xmm9 = mem[2,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] +; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX-NEXT: vpblendw $207, (%rsp), %xmm15, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,1,2,3],xmm15[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vpblendw 
{{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3198,13 +3196,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps %ymm3, 32(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm3, (%rdx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, 32(%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm0, (%rcx) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rcx) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rcx) ; AVX-NEXT: vmovaps %ymm0, 32(%r8) -; AVX-NEXT: vmovaps %ymm7, (%r8) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm0, (%r8) ; AVX-NEXT: vmovaps %ymm2, 32(%r9) ; AVX-NEXT: vmovaps %ymm1, (%r9) ; AVX-NEXT: addq $424, %rsp # imm = 0x1A8 @@ -3859,738 +3857,764 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride5_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15] +; AVX512-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] +; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] -; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa64 176(%rdi), %xmm20 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15] -; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm15 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3] -; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15)) -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-NEXT: vpsrlq $48, %xmm20, %xmm15 -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm0, %ymm2 -; 
AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0],xmm12[1],xmm11[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandnq %zmm6, %zmm20, %zmm13 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm4 & zmm20) +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1],ymm4[2],ymm6[3],ymm4[4],ymm6[5,6],ymm4[7],ymm6[8,9],ymm4[10],ymm6[11],ymm4[12],ymm6[13,14],ymm4[15] ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u] -; AVX512-NEXT: vpor %ymm0, %ymm13, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] -; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0)) -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = 
[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15] -; AVX512-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4],xmm15[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm16 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX512-NEXT: vmovdqa64 %xmm15, %xmm22 -; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] -; AVX512-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] +; AVX512-NEXT: vpsrlq $48, %xmm9, %xmm13 +; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm11[0,1],xmm12[2],xmm11[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-NEXT: 
vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm17 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15] -; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13)) -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 -; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; 
AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm13 +; AVX512-NEXT: vpandnq %zmm13, %zmm20, %zmm13 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm14[1],ymm3[2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6],xmm1[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u] +; AVX512-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & zmm20) | zmm13 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12],ymm4[13],ymm6[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2],xmm5[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm14[2],ymm3[3],ymm14[4],ymm3[5,6],ymm14[7],ymm3[8,9],ymm14[10],ymm3[11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm5[0,1,2],xmm15[3,4],xmm5[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13],ymm2[14],ymm14[15] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm5 & zmm20) | zmm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4],xmm13[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm20 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm11[2],xmm12[3] +; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm5[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm5[0],xmm13[1,2,3],xmm5[4,5],xmm13[6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],xmm10[2],xmm9[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6],ymm5[7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vextracti64x4 $1, 
%zmm1, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride5_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 
= xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] -; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12)) -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] -; AVX512-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [2,4,7,1,4,6,0,0] +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [8,9,3,2,4,0,0,0] +; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm10, %ymm12 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,1,0,0,3,5,0] +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandnq %zmm10, %zmm13, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm9 & zmm13) +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm11[0,1],ymm9[2],ymm11[3],ymm9[4],ymm11[5,6],ymm9[7],ymm11[8,9],ymm9[10],ymm11[11],ymm9[12],ymm11[13,14],ymm9[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm17 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,0,0,0,4,7,1,6] +; 
AVX512-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm14 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm4, %xmm15 +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,2,5,7,4,7,0,0] +; AVX512-FCP-NEXT: vpermd %ymm15, %ymm16, %ymm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,3,2,0,1,3,6,0] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm15, %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm20 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpandnq %zmm10, %zmm13, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm12 & zmm13) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15] +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [1,4,6,3,1,4,6,3] +; AVX512-FCP-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} 
ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm18 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4],xmm10[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,2,0,0,5,7,2,4] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,3,5,2,5,7,0,0] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; 
AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm12[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,4,6,0,1,4,6,0] +; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vpandnq %zmm14, %zmm13, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm10 & zmm13) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,3,0,0,5,0,2,7] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6)) -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = 
ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,6,0,5,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0] +; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm10)) +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm10 +; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} ymm14 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3,4,5,6,7],ymm10[8],ymm14[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,6,3,6,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: movb $7, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 
(%r8) +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride5_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm7 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = 
[u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa64 176(%rdi), %xmm20 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm11, %ymm12, %ymm15 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7 -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpsrlq $48, %xmm20, %xmm15 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm15 -; 
AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[3,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm11 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0],xmm12[1],xmm11[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandnq %zmm6, %zmm20, %zmm13 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm4 & zmm20) +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1],ymm4[2],ymm6[3],ymm4[4],ymm6[5,6],ymm4[7],ymm6[8,9],ymm4[10],ymm6[11],ymm4[12],ymm6[13,14],ymm4[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm0, %ymm13, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13 -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, 
%ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm20 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4],xmm15[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm16 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm13 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm22 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm14, %xmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2],ymm13[3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] +; AVX512DQ-NEXT: vpsrlq $48, %xmm9, %xmm13 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm11[0,1],xmm12[2],xmm11[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = 
xmm14[0],xmm13[1],xmm14[2],xmm13[3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9 -; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm17 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm13 +; AVX512DQ-NEXT: vpandnq %zmm13, %zmm20, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm21 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm14[1],ymm3[2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6],xmm1[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & zmm20) | zmm13 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4],ymm4[5],ymm6[6,7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12],ymm4[13],ymm6[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2],xmm5[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm17 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,1,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm11[1],xmm12[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm3[0,1],ymm14[2],ymm3[3],ymm14[4],ymm3[5,6],ymm14[7],ymm3[8,9],ymm14[10],ymm3[11],ymm14[12],ymm3[13,14],ymm14[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm19 +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4],xmm5[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm14 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13],ymm2[14],ymm14[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3,4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm5 & zmm20) | zmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm13 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4],xmm13[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm20 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0],xmm10[1],xmm9[2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm12[0,1],xmm11[2],xmm12[3] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm15 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm14[1],ymm15[2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10],ymm14[11],ymm15[12,13],ymm14[14],ymm15[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7] +; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm13 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1,2,3],xmm5[4,5],xmm13[6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1],xmm10[2],xmm9[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4],ymm3[5,6],ymm5[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1} -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512DQ-NEXT: vpshufb 
{{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512DQ-FCP-NEXT: vpblendw 
{{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] -; AVX512DQ-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [2,4,7,1,4,6,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [8,9,3,2,4,0,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm10, %ymm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,1,0,0,3,5,0] +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandnq %zmm10, %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm9 & zmm13) +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm11[0,1],ymm9[2],ymm11[3],ymm9[4],ymm11[5,6],ymm9[7],ymm11[8,9],ymm9[10],ymm11[11],ymm9[12],ymm11[13,14],ymm9[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm17 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,0,0,0,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm14, %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm14 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm4, %xmm15 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,2,5,7,4,7,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm16, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,3,2,0,1,3,6,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm15, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm20 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vpandnq %zmm10, %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm12 & zmm13) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4],ymm9[5],ymm11[6,7],ymm9[8],ymm11[9,10],ymm9[11],ymm11[12],ymm9[13],ymm11[14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [1,4,6,3,1,4,6,3] +; AVX512DQ-FCP-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm18 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4],xmm10[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,2,0,0,5,7,2,4] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm14[2],xmm4[2],xmm14[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,3,5,2,5,7,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} 
ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm12[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [1,4,6,0,1,4,6,0] +; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vpandnq %zmm14, %zmm13, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm10 & zmm13) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,3,0,0,5,0,2,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = 
ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, 
%zmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7],ymm7[8,9],ymm6[10],ymm7[11],ymm6[12],ymm7[13,14],ymm6[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,6,0,5,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,4,7,0,2,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3,4,5,6,7],ymm10[8],ymm14[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm5[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,6,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8) +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10],ymm9[11],ymm11[12,13],ymm9[14],ymm11[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -5860,165 +5884,147 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-LABEL: load_i16_stride5_vf64: ; AVX: # %bb.0: ; AVX-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 288(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,1,0,3] +; AVX-NEXT: vmovdqa 192(%rdi), %xmm10 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6,7] +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 160(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa 256(%rdi), %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] -; AVX-NEXT: 
vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 272(%rdi), %xmm15 -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] -; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 240(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX-NEXT: vmovdqa 192(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa 176(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] -; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vmovaps 224(%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 576(%rdi), %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] +; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX-NEXT: vmovdqa 224(%rdi), %xmm6 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,0,1] +; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 256(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 592(%rdi), %xmm12 -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX-NEXT: vmovdqa 272(%rdi), %xmm12 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm12[1] ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 560(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 240(%rdi), %xmm11 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2 +; AVX-NEXT: vorps %ymm2, %ymm1, %ymm2 +; AVX-NEXT: 
vmovdqa 304(%rdi), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 288(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 624(%rdi), %xmm3 -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX-NEXT: vmovdqa 496(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 480(%rdi), %xmm9 -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] -; AVX-NEXT: vmovdqa 512(%rdi), %xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] -; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vmovaps 544(%rdi), %xmm11 -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,1,0,1] -; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 96(%rdi), %xmm10 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] +; AVX-NEXT: vmovdqa 512(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vmovdqa 576(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw 
{{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 144(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] -; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX-NEXT: vmovdqa 592(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX-NEXT: vmovdqa 560(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX-NEXT: vmovdqa 544(%rdi), %xmm9 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm4 +; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX-NEXT: vmovdqa 624(%rdi), %xmm4 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] -; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 -; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 +; AVX-NEXT: vmovdqa 608(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] -; AVX-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = 
xmm0[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vmovdqa 96(%rdi), %xmm8 +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,1,3] +; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] -; AVX-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,0,1] +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm4 +; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm4 +; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm4 +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 128(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa 320(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 336(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX-NEXT: vmovdqa 368(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] @@ -6026,240 +6032,267 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: 
vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] +; AVX-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; AVX-NEXT: vmovdqa 400(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2 ; AVX-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] -; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX-NEXT: vandnps %ymm3, %ymm13, %ymm3 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 +; AVX-NEXT: vmovdqa 464(%rdi), %xmm3 +; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 448(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3,4],xmm1[5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] ; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX-NEXT: # xmm1 = mem[0,3,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm0, %xmm15 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm2 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm14[2,3],xmm10[4,5],xmm14[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm10[2,3],xmm12[4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1 +; AVX-NEXT: vpsllq $48, %xmm6, %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm15 +; AVX-NEXT: vandnps %ymm15, %ymm13, %ymm15 +; AVX-NEXT: vorps %ymm1, %ymm15, %ymm15 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm6[4,5],xmm12[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpsllq $48, %xmm4, %xmm15 -; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm4, %xmm15 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm3, %xmm5 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm3[0,1],mem[2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2],xmm5[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 +; AVX-NEXT: vpsllq $48, %xmm9, %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm15 +; AVX-NEXT: vandnps %ymm15, %ymm13, %ymm15 +; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7] +; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm9[0,1,2,3],mem[4,5],xmm9[6,7] ; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vpsllq $48, %xmm11, %xmm15 -; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX-NEXT: vorps %ymm3, %ymm15, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm15[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] -; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm10, %xmm15 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm3, %xmm5 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte 
Reload +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm3[0,1],mem[2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4,5,6,7] +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1],xmm8[2,3],mem[4,5,6,7] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,0,4,5,6,7] +; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] +; AVX-NEXT: vpshufd $236, (%rsp), %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX-NEXT: vpsllq $48, %xmm7, %xmm15 -; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15 -; AVX-NEXT: vorps %ymm3, %ymm15, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm15 +; AVX-NEXT: vandnps %ymm15, %ymm13, %ymm15 +; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm15[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = mem[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,2,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm3, %xmm5 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm3[0,1],mem[2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,1],xmm2[2,3],mem[4,5,6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: 
vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[0,3,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,3,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm0, %ymm13, %ymm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm3, %xmm3 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpsllq $48, %xmm2, %xmm2 -; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vpsllq $48, %xmm3, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm5 +; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5 +; AVX-NEXT: vorps %ymm5, %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7] +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm10[4,5],mem[6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[3,1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm12[2,3],xmm6[4,5,6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] -; AVX-NEXT: 
vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[0,1,2,0] -; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] ; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,1],xmm14[2,3],mem[4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3,4,5],xmm6[6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,2,0] +; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm12[0,1,2,3],mem[4,5],xmm12[6,7] +; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[0,1],xmm9[2,3],mem[4,5,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; 
AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm9[0,1],mem[2,3],xmm9[4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3,4,5],xmm6[6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,2,0] +; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,1],xmm7[2,3],mem[4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3,4,5],xmm6[6,7] ; AVX-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX-NEXT: # xmm15 = mem[0,1,2,0] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm9[2,3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3] ; 
AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0] -; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,0] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6272,100 +6305,100 @@ define void @load_i16_stride5_vf64(ptr %in.vec, 
ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vpsrlq $48, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] -; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload ; AVX-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[0,1,0,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3,4,5],xmm6[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1],xmm12[2,3],mem[4,5],xmm12[6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2,3],xmm12[4,5],xmm14[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm13[2,3],xmm10[4,5,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3,4,5],xmm6[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 -; AVX-NEXT: vpsrlq $48, %xmm8, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX-NEXT: vmovdqa %xmm0, %xmm4 -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm0, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; 
AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7] +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[0,1],xmm8[2,3],mem[4,5,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX-NEXT: vmovdqa %xmm2, %xmm8 +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm7[4,5],mem[6,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload -; AVX-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] -; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3,4,5],xmm6[6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpsrlq $48, %xmm7, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm9[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload +; AVX-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpsrlq $48, %xmm5, %xmm5 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] +; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,1],xmm3[2,3],mem[4,5,6,7] +; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm1 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm9[4,5],xmm5[6,7] ; 
AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -6385,114 +6418,116 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: # xmm2 = mem[2,3,2,3] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] -; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX-NEXT: # xmm2 = mem[1,1,1,1] -; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX-NEXT: # xmm1 = mem[1,1,1,1] +; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; AVX-NEXT: # xmm2 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] -; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm6 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3,4,5],xmm6[6,7] +; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,1,2,3] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[0,3,2,3] -; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] +; AVX-NEXT: vpshufd 
{{.*#+}} xmm6 = xmm12[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[0,3,2,3] +; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = xmm6[0,1,2],mem[3],xmm6[4,5,6,7] ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; AVX-NEXT: # xmm15 = mem[2,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] -; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX-NEXT: # xmm15 = mem[1,1,1,1] -; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; AVX-NEXT: # xmm12 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3] -; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 -; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX-NEXT: # xmm5 = mem[3,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] -; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX-NEXT: # xmm13 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; AVX-NEXT: # xmm13 = mem[0,3,2,3] -; AVX-NEXT: vpblendw $8, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload -; AVX-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7] -; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX-NEXT: # xmm10 = mem[2,3,2,3] -; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1] +; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload -; AVX-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] -; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX-NEXT: # xmm4 = mem[1,1,1,1] -; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; AVX-NEXT: # xmm7 = mem[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] -; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm15, %xmm15 +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5],xmm15[6,7] +; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX-NEXT: # xmm15 = mem[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm15[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = mem[0,3,2,3] +; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3],xmm8[4,5,6,7] +; AVX-NEXT: vpshufd $238, (%rsp), %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = mem[2,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3],xmm6[4,5,6,7] +; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX-NEXT: # xmm8 = mem[1,1,1,1] +; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm11[0],xmm8[0],xmm11[1],xmm8[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm11 # 16-byte Folded Reload +; AVX-NEXT: # xmm11 = mem[0,1,2,3],xmm3[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm11 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3,4,5],xmm11[6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm11 = 
xmm7[0,1,1,3] +; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm11[6,7] +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm8 +; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX-NEXT: # xmm3 = mem[3,1,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] +; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = mem[0,3,2,3] +; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3],xmm4[4,5,6,7] +; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[2,3,2,3] +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3,4,5,6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,2,2,3] +; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm6 # 16-byte Folded Reload +; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm5[4,5],mem[6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] ; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX-NEXT: # xmm4 = mem[0,1,1,3] ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm3, 64(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, (%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, (%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 96(%rsi) +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm3, 64(%rdx) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -6518,7 +6553,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm3, 32(%r8) ; AVX-NEXT: vmovaps %ymm2, 64(%r9) -; AVX-NEXT: vmovaps %ymm12, (%r9) +; AVX-NEXT: vmovaps %ymm8, (%r9) ; AVX-NEXT: vmovaps %ymm0, 96(%r9) ; AVX-NEXT: vmovaps %ymm1, 32(%r9) ; AVX-NEXT: addq $1032, %rsp # imm = 0x408 @@ -7902,35 +7937,32 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512-LABEL: load_i16_stride5_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: 
subq $552, %rsp # imm = 0x228 -; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512-NEXT: vmovdqa 416(%rdi), %ymm11 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] -; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15] +; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8 -; AVX512-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm19 -; AVX512-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512-NEXT: vmovdqa 176(%rdi), %xmm12 -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -7940,62 +7972,111 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512-NEXT: vmovdqa64 160(%rdi), %xmm16 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] -; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] -; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-NEXT: vporq %ymm0, %ymm2, %ymm25 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512-NEXT: vmovdqa %ymm13, %ymm11 +; 
AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[0,1,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512-NEXT: vmovdqa %xmm15, %xmm12 +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm14[1],xmm2[2,3] +; AVX512-NEXT: vmovdqa %xmm2, %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm28 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] -; AVX512-NEXT: vmovdqa64 %ymm15, %ymm18 -; AVX512-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX512-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} 
zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & zmm20) | zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm22 +; AVX512-NEXT: vmovdqa64 %ymm10, %ymm21 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3] -; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16 -; AVX512-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm27 +; AVX512-NEXT: vmovdqa64 %xmm12, %xmm19 +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %ymm2, %ymm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] +; AVX512-NEXT: vmovdqa %xmm14, %xmm10 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm14[2],xmm15[3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] -; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] +; AVX512-NEXT: vmovdqa %ymm6, %ymm5 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) @@ -8006,853 +8087,777 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX512-NEXT: vmovdqa 464(%rdi), %xmm14 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512-NEXT: vmovdqa 448(%rdi), %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX512-NEXT: vmovdqa 448(%rdi), %xmm12 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa %xmm11, %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovdqa %xmm15, %xmm6 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX512-NEXT: vmovdqa %xmm10, %xmm7 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX512-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX512-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX512-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm21 -; 
AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa 480(%rdi), %xmm7 -; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7] -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm24 -; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm18 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm18[3,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm26 +; AVX512-NEXT: vmovdqa 576(%rdi), %ymm15 +; AVX512-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm11[0,1],ymm15[2],ymm11[3],ymm15[4],ymm11[5,6],ymm15[7],ymm11[8,9],ymm15[10],ymm11[11],ymm15[12],ymm11[13,14],ymm15[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4],xmm0[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm26, %zmm13 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3] -; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25 -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512-NEXT: vpandnq %zmm13, %zmm20, %zmm2 +; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = zmm2 | (zmm20 & mem) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm29 +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5,6,7] +; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm26 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm19[3,1,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm27[0,2,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3] -; AVX512-NEXT: vmovdqa64 %xmm7, %xmm30 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] -; AVX512-NEXT: vpsrlq $48, %xmm21, %xmm13 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512-NEXT: vpshufb %ymm13, %ymm10, %ymm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 
%ymm20, %ymm9 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] -; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload -; AVX512-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7] -; AVX512-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3] -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm20 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512-NEXT: vpsrlq $48, %xmm16, %xmm11 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15] -; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] -; AVX512-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28)) -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15] -; AVX512-NEXT: vextracti128 $1, 
%ymm0, %xmm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm28 -; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2)) -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm8 -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm14 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] -; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm26 -; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7] -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX512-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512-NEXT: vmovdqa64 %ymm24, 
%ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] +; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512-NEXT: vpandnq %zmm2, %zmm20, %zmm1 +; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = zmm1 | (zmm20 & mem) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5],ymm3[6],ymm13[7,8],ymm3[9],ymm13[10,11],ymm3[12],ymm13[13],ymm3[14],ymm13[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX512-NEXT: vpsrlq $48, %xmm19, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm6[2],xmm7[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm23 +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7] -; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] +; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] -; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = 
xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0)) -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm13 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7] -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15 -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX512-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa %ymm8, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512-NEXT: vmovdqa %xmm5, %xmm6 -; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm11 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0)) -; AVX512-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = (zmm25 & zmm20) | zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm25 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512-NEXT: vpsrlq $48, %xmm18, %xmm2 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vmovdqa %xmm14, %xmm3 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm14[2],xmm12[3] +; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4],ymm15[5],ymm11[6,7],ymm15[8],ymm11[9,10],ymm15[11],ymm11[12],ymm15[13],ymm11[14,15] +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3] +; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = zmm1 | (zmm20 & mem) +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm19 +; AVX512-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm11 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-NEXT: vmovdqa 
%xmm4, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm12[1],xmm3[2,3] +; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512-NEXT: vpandnq %zmm2, %zmm20, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & zmm20) +; AVX512-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm18 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm13[1],ymm10[2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3],ymm7[4],ymm11[5,6],ymm7[7],ymm11[8,9],ymm7[10],ymm11[11],ymm7[12],ymm11[13,14],ymm7[15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm9[1],xmm6[2,3] +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2],xmm12[3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm8, %ymm9 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm25 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm11 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm8 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3],ymm13[4],ymm10[5,6],ymm13[7],ymm10[8,9],ymm13[10],ymm10[11],ymm13[12],ymm10[13,14],ymm13[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1,2],ymm7[3],ymm11[4],ymm7[5],ymm11[6,7],ymm7[8],ymm11[9,10],ymm7[11],ymm11[12],ymm7[13],ymm11[14,15] +; 
AVX512-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm9[2],xmm6[3] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: movb $7, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7] +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] -; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2,3,4,5,6,7],ymm3[8],ymm4[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4,5,6,7] +; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 
16-byte Folded Reload +; AVX512-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm4 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX512-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX512-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] -; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7] +; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm2 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm28, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rsi) +; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512-NEXT: vmovdqa64 %zmm25, (%rdx) +; AVX512-NEXT: vmovdqa64 %zmm18, 
64(%rcx) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, (%rdx) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512-NEXT: vmovaps %zmm2, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r8) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm2, (%r8) ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i16_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX512-FCP-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] +; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4],ymm8[5],ymm15[6,7],ymm8[8],ymm15[9,10],ymm8[11],ymm15[12],ymm8[13],ymm15[14,15] +; AVX512-FCP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,4,6,1,3] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm1, %ymm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm5 +; AVX512-FCP-NEXT: 
vmovdqa64 %xmm2, %xmm20 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] -; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti128 
$1, %xmm5, %ymm0, %ymm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] -; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,0,0,4,7,1,6] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5],ymm8[6],ymm15[7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13],ymm8[14],ymm15[15] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm11[1],xmm5[2,3] +; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,6,0,5,0,0,0] +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1],ymm14[2],ymm10[3],ymm14[4],ymm10[5,6],ymm14[7],ymm10[8,9],ymm14[10],ymm10[11],ymm14[12],ymm10[13,14],ymm14[15] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,4,7,0,2,4,7,0] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm5, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm23 +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm11[2],xmm12[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm17 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,6,3,6,0,0,0] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4],ymm14[5],ymm10[6,7],ymm14[8],ymm10[9,10],ymm14[11],ymm10[12],ymm14[13],ymm10[14,15] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm31 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [2,4,7,1,4,6,0,0] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,9,3,2,4,5,7,6] +; AVX512-FCP-NEXT: vpermt2d %ymm21, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,3,1,3,0,3,5,7] +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm29, %ymm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm30 +; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8,9],ymm7[10],ymm6[11],ymm7[12],ymm6[13,14],ymm7[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm15 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4],xmm0[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandnq %zmm30, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm16 & zmm21) ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm10 -; AVX512-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] -; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] -; AVX512-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13],ymm14[14],ymm10[15] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm19, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512-FCP-NEXT: vpermt2d %ymm22, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm29, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1],ymm3[2],ymm14[3],ymm3[4],ymm14[5,6],ymm3[7],ymm14[8,9],ymm3[10],ymm14[11],ymm3[12],ymm14[13,14],ymm3[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4],xmm4[5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512-FCP-NEXT: vpandnq %zmm1, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm1 | (zmm21 & mem) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; 
AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4 -; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm17, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,2,5,7,4,7,0,0] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 +; AVX512-FCP-NEXT: vpermd %ymm4, %ymm29, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,2,3,1,3,6,7] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm4, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm15 +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [1,4,6,3,1,4,6,3] +; AVX512-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm14[1,2],ymm3[3],ymm14[4],ymm3[5],ymm14[6,7],ymm3[8],ymm14[9,10],ymm3[11],ymm14[12],ymm3[13],ymm14[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm22, %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512-FCP-NEXT: vpandnq %zmm15, %zmm21, %zmm10 +; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm10 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm10 = zmm10 | (zmm21 & mem) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512-FCP-NEXT: 
vpermd %ymm0, %ymm29, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm30 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm20, %xmm10 +; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm0[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm22, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512-FCP-NEXT: vpandnq %zmm4, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm1 | (zmm21 & mem) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7],ymm7[8,9],ymm0[10],ymm7[11],ymm0[12],ymm7[13,14],ymm0[15] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm14 -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,2,0,0,5,7,2,4] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] 
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm17[2],xmm11[3],xmm17[3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10],ymm12[11],ymm2[12,13],ymm12[14],ymm2[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,3,5,2,5,7,0,0] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm29, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm6[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,4,6,0,1,4,6,0] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm28, %ymm11, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vpandnq %zmm8, %zmm21, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm0 & zmm21) +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10,11],ymm14[12],ymm2[13],ymm14[14],ymm2[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4],xmm13[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4],xmm6[5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm12[0],ymm3[1],ymm12[2,3],ymm3[4],ymm12[5],ymm3[6],ymm12[7,8],ymm3[9],ymm12[10,11],ymm3[12],ymm12[13],ymm3[14],ymm12[15] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm22, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm29, %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm30[2],xmm4[3],xmm30[3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm11, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpandnq %zmm6, %zmm21, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm1 & zmm21) +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm7[1,2],ymm1[3],ymm7[4],ymm1[5],ymm7[6,7],ymm1[8],ymm7[9,10],ymm1[11],ymm7[12],ymm1[13],ymm7[14,15] ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; 
AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] -; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 -; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] -; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = 
[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,5,0,2,7] +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm29 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm29 & (zmm27 ^ zmm4)) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13],ymm2[14],ymm14[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm27, %ymm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm21 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4],ymm15[5],ymm0[6,7],ymm15[8],ymm0[9,10],ymm15[11],ymm0[12],ymm15[13],ymm0[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0],xmm4[1],xmm13[2],xmm4[3] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm27 +; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm8 +; AVX512-FCP-NEXT: 
vpshufb %ymm9, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5],ymm0[6,7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0] +; AVX512-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512-FCP-NEXT: vpermd %ymm23, %ymm2, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm25, %ymm9 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm12 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512-FCP-NEXT: 
vmovdqa64 %ymm30, %ymm5 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm24, %ymm3 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,0,6,0,3,5] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4],xmm13[5,6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] ; AVX512-FCP-NEXT: movb $7, %al ; 
AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15] -; AVX512-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k1} +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm31, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7,8],ymm14[9],ymm1[10],ymm14[11],ymm1[12,13],ymm14[14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6,7],ymm9[8],ymm2[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5],ymm15[6],ymm1[7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13],ymm15[14],ymm1[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3,4],xmm12[5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 
%ymm27, %ymm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1],ymm1[2],ymm6[3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11],ymm1[12],ymm6[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6],xmm7[7] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm5 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, (%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rsi) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rdx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, (%rdx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2,3,4,5,6,7],ymm5[8],ymm3[9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vmovaps %zmm3, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512-FCP-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i16_stride5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $552, %rsp # imm = 0x228 -; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-NEXT: subq $456, %rsp # imm = 0x1C8 +; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm9 +; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15] +; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] +; AVX512DQ-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm19 -; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm15 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm12 -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm14 +; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 @@ -8862,62 +8867,111 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm12 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13 +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512DQ-NEXT: vmovdqa64 160(%rdi), 
%xmm16 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] -; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vporq %ymm0, %ymm2, %ymm25 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm2 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm10 +; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm11 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm13 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[0,1,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; AVX512DQ-NEXT: vmovdqa %xmm15, %xmm12 +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm15[2],xmm1[3],xmm15[3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm14 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm14[1],xmm2[2,3] +; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm28 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm18 -; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15] +; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm9 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13],ymm4[14],ymm6[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm24 +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; 
AVX512DQ-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & zmm20) | zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm22 +; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm21 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16 -; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm30 -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm13[1],xmm12[2,3] +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm27 +; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm19 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm7 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] +; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm14[2],xmm15[3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa 
%ymm6, %ymm5 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) @@ -8928,819 +8982,746 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] +; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm14 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] +; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa %xmm11, %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512DQ-NEXT: vmovdqa %xmm15, %xmm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm9 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,2,2,3] +; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm7 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 -; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm5 -; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm21 -; AVX512DQ-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm21[3,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm24 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm18 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm18[3,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm26 +; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm15 +; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm11 +; AVX512DQ-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm11[0,1],ymm15[2],ymm11[3],ymm15[4],ymm11[5,6],ymm15[7],ymm11[8,9],ymm15[10],ymm11[11],ymm15[12],ymm11[13,14],ymm15[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4],xmm0[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm26, %zmm13 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25 -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpandnq %zmm13, %zmm20, %zmm2 +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = zmm2 | (zmm20 & mem) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm29 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm13 +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm26 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm19[3,1,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm27[0,2,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0],xmm6[1],xmm7[2,3] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm22 -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm30 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm13 -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm10, %ymm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = 
xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm9 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] -; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm20 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm1 -; AVX512DQ-NEXT: vpsrlq $48, %xmm16, %xmm11 -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19 -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm23 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %ymm24, 
%ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm28 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8 -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm14 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm5 -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21 -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm26 -; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] -; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm31 +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm10 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm20, %zmm1 +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = zmm1 | (zmm20 & mem) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5],ymm3[6],ymm13[7,8],ymm3[9],ymm13[10,11],ymm3[12],ymm13[13],ymm3[14],ymm13[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm27[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpsrlq $48, %xmm19, %xmm2 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm6[2],xmm7[3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm23 +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX512DQ-NEXT: vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm0 -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX512DQ-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] -; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7] -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm14 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm11 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512DQ-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = (zmm25 & zmm20) | zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm25 +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm5 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm4 +; AVX512DQ-NEXT: vpsrlq $48, %xmm18, %xmm2 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm3 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm14[2],xmm12[3] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm11[1,2],ymm15[3],ymm11[4],ymm15[5],ymm11[6,7],ymm15[8],ymm11[9,10],ymm15[11],ymm11[12],ymm15[13],ymm11[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm18 +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX512DQ-NEXT: vpandnq %zmm1, %zmm20, %zmm1 +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = zmm1 | (zmm20 & mem) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm19 +; AVX512DQ-NEXT: vmovdqu (%rsp), %ymm14 # 32-byte Reload +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] +; AVX512DQ-NEXT: vpblendd {{.*#+}} 
ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm11 +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm12[1],xmm3[2,3] +; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm20, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & zmm20) +; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm18 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm13[1],ymm10[2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7,8],ymm13[9],ymm10[10],ymm13[11],ymm10[12,13],ymm13[14],ymm10[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3],ymm7[4],ymm11[5,6],ymm7[7],ymm11[8,9],ymm7[10],ymm11[11],ymm7[12],ymm11[13,14],ymm7[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm9[1],xmm6[2,3] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] +; 
AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2],xmm12[3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm8, %ymm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm25 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm11 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3],ymm13[4],ymm10[5,6],ymm13[7],ymm10[8,9],ymm13[10],ymm10[11],ymm13[12],ymm10[13,14],ymm13[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1,2],ymm7[3],ymm11[4],ymm7[5],ymm11[6,7],ymm7[8],ymm11[9,10],ymm7[11],ymm11[12],ymm7[13],ymm11[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm7 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2],ymm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm9[2],xmm6[3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512DQ-NEXT: movb $7, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7] +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm3[0],ymm4[1,2,3,4,5,6,7],ymm3[8],ymm4[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3] +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload +; AVX512DQ-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm4 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7],ymm4[8,9],mem[10],ymm4[11],mem[12],ymm4[13,14],mem[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5],ymm5[6],mem[7,8],ymm5[9],mem[10,11],ymm5[12],mem[13],ymm5[14],mem[15] ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] -; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 
%ymm23, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] +; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%rdx) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rcx) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) -; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r8) +; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r8) ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovaps %zmm2, (%r8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512DQ-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512DQ-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: subq $520, %rsp # imm = 0x208 +; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5],ymm14[6],ymm12[7,8],ymm14[9],ymm12[10,11],ymm14[12],ymm12[13],ymm14[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4],ymm8[5],ymm15[6,7],ymm8[8],ymm15[9,10],ymm8[11],ymm15[12],ymm8[13],ymm15[14,15] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,4,6,1,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm1, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm21 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] -; AVX512DQ-FCP-NEXT: vpermd 
%ymm5, %ymm6, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] -; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15] ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,0,0,0,4,7,1,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5],ymm8[6],ymm15[7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13],ymm8[14],ymm15[15] 
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0],xmm11[1],xmm5[2,3] +; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,6,0,5,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1],ymm14[2],ymm10[3],ymm14[4],ymm10[5,6],ymm14[7],ymm10[8,9],ymm14[10],ymm10[11],ymm14[12],ymm10[13,14],ymm14[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,4,7,0,2,4,7,0] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm5, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm11[2],xmm12[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm17 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,6,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm10[1,2],ymm14[3],ymm10[4],ymm14[5],ymm10[6,7],ymm14[8],ymm10[9,10],ymm14[11],ymm10[12],ymm14[13],ymm10[14,15] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm31 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [2,4,7,1,4,6,0,0] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,9,3,2,4,5,7,6] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm21, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,3,1,3,0,3,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm29, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8,9],ymm7[10],ymm6[11],ymm7[12],ymm6[13,14],ymm7[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3,4],xmm0[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandnq %zmm30, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm16 & zmm21) ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm10 -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] -; AVX512DQ-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5],ymm14[6],ymm10[7,8],ymm14[9],ymm10[10,11],ymm14[12],ymm10[13],ymm14[14],ymm10[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm19, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm22, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm29, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1],ymm3[2],ymm14[3],ymm3[4],ymm14[5,6],ymm3[7],ymm14[8,9],ymm3[10],ymm14[11],ymm3[12],ymm14[13,14],ymm3[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3,4],xmm4[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] +; AVX512DQ-FCP-NEXT: vpandnq %zmm1, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 
64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 | (zmm21 & mem) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm17, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,2,5,7,4,7,0,0] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16 +; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm29, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = 
[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,2,3,1,3,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm4, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm15 +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [1,4,6,3,1,4,6,3] +; AVX512DQ-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm14[1,2],ymm3[3],ymm14[4],ymm3[5],ymm14[6,7],ymm3[8],ymm14[9,10],ymm3[11],ymm14[12],ymm3[13],ymm14[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm14 +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm22, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX512DQ-FCP-NEXT: vpandnq %zmm15, %zmm21, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm10 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm10 = zmm10 | (zmm21 & mem) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm24 +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm30 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm20, %xmm10 +; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm0[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm22, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: vpandnq %zmm4, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 | (zmm21 & mem) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3],ymm0[4],ymm7[5,6],ymm0[7],ymm7[8,9],ymm0[10],ymm7[11],ymm0[12],ymm7[13,14],ymm0[15] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm14 -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,2,0,0,5,7,2,4] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm17[2],xmm11[3],xmm17[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2],ymm12[3],ymm2[4,5],ymm12[6],ymm2[7,8],ymm12[9],ymm2[10],ymm12[11],ymm2[12,13],ymm12[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,3,5,2,5,7,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm29, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm6[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [1,4,6,0,1,4,6,0] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm11, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vpandnq %zmm8, %zmm21, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm0 & zmm21) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm14[1],ymm2[2,3],ymm14[4],ymm2[5],ymm14[6],ymm2[7,8],ymm14[9],ymm2[10,11],ymm14[12],ymm2[13],ymm14[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4],xmm13[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm15[2],ymm0[3],ymm15[4],ymm0[5,6],ymm15[7],ymm0[8,9],ymm15[10],ymm0[11],ymm15[12],ymm0[13,14],ymm15[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3,4],xmm6[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm3[1],ymm12[2,3],ymm3[4],ymm12[5],ymm3[6],ymm12[7,8],ymm3[9],ymm12[10,11],ymm3[12],ymm12[13],ymm3[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm22, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8],ymm8[9],ymm6[10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm29, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm30[2],xmm4[3],xmm30[3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm11, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpandnq %zmm6, %zmm21, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm1 & zmm21) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, 
%ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3,4],xmm8[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm6, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm7[1,2],ymm1[3],ymm7[4],ymm1[5],ymm7[6,7],ymm1[8],ymm7[9,10],ymm1[11],ymm7[12],ymm1[13],ymm7[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] -; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 -; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 -; 
AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] -; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,5,0,2,7] +; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm5 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10 +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm29 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm29 & (zmm27 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm2[1],ymm14[2,3],ymm2[4],ymm14[5],ymm2[6],ymm14[7,8],ymm2[9],ymm14[10,11],ymm2[12],ymm14[13],ymm2[14],ymm14[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm27, %ymm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm21 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4],ymm15[5],ymm0[6,7],ymm15[8],ymm0[9,10],ymm15[11],ymm0[12],ymm15[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0],xmm4[1],xmm13[2],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm12[1],ymm3[2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7,8],ymm12[9],ymm3[10],ymm12[11],ymm3[12,13],ymm12[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm6 +; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm0[2],xmm2[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4],ymm2[5],ymm0[6,7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12],ymm2[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm8[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,1,3,0,2,5,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm2, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm7[1],ymm0[2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7,8],ymm7[9],ymm0[10,11],ymm7[12],ymm0[13],ymm7[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm3, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm25, %ymm9 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm25 -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm12 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm5 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX512DQ-FCP-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm24, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0,1],ymm10[2],ymm5[3],ymm10[4],ymm5[5,6],ymm10[7],ymm5[8,9],ymm10[10],ymm5[11],ymm10[12],ymm5[13,14],ymm10[15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,0,6,0,3,5] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4],xmm13[5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7] ; AVX512DQ-FCP-NEXT: movb $7, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: 
vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15] -; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm31 {%k1} +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm31, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7,8],ymm14[9],ymm1[10],ymm14[11],ymm1[12,13],ymm14[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6,7],ymm9[8],ymm2[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5],ymm15[6],ymm1[7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13],ymm15[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3,4],xmm12[5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm6[0,1],ymm1[2],ymm6[3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11],ymm1[12],ymm6[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm0, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm5 ; 
AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rsi) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2,3,4,5,6,7],ymm5[8],ymm3[9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rsi) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rdx) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) +; AVX512DQ-FCP-NEXT: addq $520, %rsp # imm = 0x208 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index d1d7cb0a34332..031f0b1a67fbd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -2110,116 +2110,115 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride3_vf64: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm6 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 -; AVX-NEXT: vmovdqa 112(%rdi), %xmm3 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm10 +; AVX-NEXT: vmovdqa (%rdi), %xmm10 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 96(%rdi), %xmm13 +; AVX-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 ; 
AVX-NEXT: vmovdqa 160(%rdi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm5 -; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm8 -; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm9 -; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] +; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX-NEXT: vpshufb %xmm5, %xmm9, %xmm8 +; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm11 +; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm12 +; AVX-NEXT: vpor %xmm3, %xmm12, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm12 +; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm5 +; AVX-NEXT: vpor %xmm5, %xmm12, %xmm5 ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm12 -; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm12 -; AVX-NEXT: vpor %xmm10, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm12 -; AVX-NEXT: vpor %xmm11, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm7 -; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm12 -; AVX-NEXT: vpor %xmm7, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm12 -; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm6 -; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa 176(%rdi), %xmm12 +; AVX-NEXT: vpor %xmm9, %xmm12, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm6 +; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm12 +; AVX-NEXT: vpor %xmm6, %xmm12, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm9 +; AVX-NEXT: vpor %xmm10, %xmm9, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vmovdqa 112(%rdi), %xmm10 +; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm12 +; AVX-NEXT: vpor %xmm13, %xmm12, %xmm12 +; AVX-NEXT: vmovdqa 176(%rdi), %xmm13 ; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0 -; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm10 -; AVX-NEXT: vpor %xmm0, %xmm10, %xmm1 -; AVX-NEXT: vmovdqa 128(%rdi), %xmm10 -; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm0 -; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm11 -; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 -; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm11 -; AVX-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm15 -; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm3 +; AVX-NEXT: vpor %xmm0, %xmm3, %xmm1 +; AVX-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm0 +; AVX-NEXT: vpshufb %xmm15, %xmm3, 
%xmm9 +; AVX-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm9 +; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm7 +; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm9 +; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm14 +; AVX-NEXT: vpor %xmm9, %xmm14, %xmm9 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] -; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm7 -; AVX-NEXT: vpor %xmm7, %xmm13, %xmm7 -; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm11 -; AVX-NEXT: vpor %xmm11, %xmm9, %xmm11 -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm8, %xmm2 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm15 -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] -; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm14 -; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15 -; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm10 -; AVX-NEXT: vpor %xmm10, %xmm15, %xmm10 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15 -; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX-NEXT: vpor %xmm12, %xmm15, %xmm12 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm6 -; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] -; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm6 -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm15 -; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm11 -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm15 -; AVX-NEXT: vpor %xmm15, %xmm11, %xmm11 -; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rsi) -; AVX-NEXT: vmovdqa %xmm2, 48(%rsi) -; AVX-NEXT: vmovdqa %xmm11, 32(%rsi) -; AVX-NEXT: vmovdqa %xmm6, 16(%rsi) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) -; AVX-NEXT: vmovdqa %xmm12, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm10, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm14, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm5, (%rcx) -; AVX-NEXT: vmovdqa %xmm8, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm9, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm13, 16(%rcx) +; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm14 +; AVX-NEXT: vpor %xmm14, %xmm11, %xmm14 +; AVX-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpshufb %xmm15, 
%xmm13, %xmm0 +; AVX-NEXT: vpor %xmm0, %xmm8, %xmm15 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128] +; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm8 +; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14] +; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm8 +; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm8 +; AVX-NEXT: vpor %xmm1, %xmm8, %xmm1 +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm5[11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm10 +; AVX-NEXT: vpor %xmm5, %xmm10, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6 +; AVX-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm10 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm13 +; AVX-NEXT: vpor %xmm13, %xmm10, %xmm10 +; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm12 +; AVX-NEXT: vpor %xmm12, %xmm9, %xmm9 +; AVX-NEXT: vmovdqa %xmm9, 48(%rsi) +; AVX-NEXT: vmovdqa %xmm10, 16(%rsi) +; AVX-NEXT: vmovdqa %xmm6, (%rsi) +; AVX-NEXT: vmovdqa %xmm5, 32(%rsi) +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) +; AVX-NEXT: vmovdqa %xmm4, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm11, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm7, (%rcx) +; AVX-NEXT: vmovdqa %xmm8, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: load_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index ac14f55e3f0ed..f6692f835783b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -2692,148 +2692,151 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride5_vf32: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa 144(%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11] -; AVX-NEXT: vmovdqa 128(%rdi), %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm5 -; AVX-NEXT: vmovdqa 112(%rdi), %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = 
xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7] -; AVX-NEXT: vmovdqa (%rdi), %xmm7 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u] +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,7,12,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm2[u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 +; AVX-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm11, %xmm0, %xmm5, %xmm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm5, %ymm12, %ymm11 -; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] -; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 -; AVX-NEXT: vorps %ymm14, %ymm11, %ymm11 -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX-NEXT: vpor %xmm11, %xmm14, %xmm11 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vandps %ymm0, %ymm12, %ymm8 +; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11] +; AVX-NEXT: vmovdqa 112(%rdi), %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u],zero,zero,zero,xmm5[3,8,13,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 96(%rdi), %xmm6 +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,4,9,14],zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm7, %xmm10, %xmm7 +; AVX-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm7[2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm7 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm13, %xmm10, %xmm13 +; 
AVX-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 +; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9 +; AVX-NEXT: vorps %ymm9, %ymm8, %ymm8 +; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[1,6,11] +; AVX-NEXT: vmovdqa 128(%rdi), %xmm10 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u] -; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6 -; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 -; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm11[5,6,7] -; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] -; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14 -; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 -; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6 -; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm14 -; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15 -; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm14 -; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5,6,7] -; AVX-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12 -; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm3[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,8,13,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm2[u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u] +; AVX-NEXT: vpblendvb %xmm11, %xmm13, %xmm14, %xmm11 +; 
AVX-NEXT: vandps %ymm12, %ymm11, %ymm11 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u],zero,zero,zero,zero,xmm5[4,9,14,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,0,5,10,15],zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13 +; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm15 +; AVX-NEXT: vandnps %ymm15, %ymm12, %ymm15 +; AVX-NEXT: vorps %ymm15, %ymm11, %ymm11 +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[2,7,12] +; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX-NEXT: vpor %xmm15, %xmm8, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2,3,4],xmm8[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u] +; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm15 +; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm13 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5,6,7] +; AVX-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u],zero,zero,zero,xmm5[0,5,10,15,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm13, %xmm15, %xmm13 +; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8 +; AVX-NEXT: vpshufb %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14 +; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12 +; AVX-NEXT: vorps %ymm12, %ymm8, %ymm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[3,8,13] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4],xmm12[5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm12 -; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm13 -; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm8, %ymm12 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u] +; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm13 +; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6,7] ; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u] -; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm13 +; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm14 +; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm13 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7] -; AVX-NEXT: vpor 
%xmm6, %xmm13, %xmm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm8, %xmm13, %xmm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u],zero,zero,zero,xmm5[1,6,11,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,2,7,12],zero,zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14 ; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] ; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[3,8,13],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14] ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6 +; AVX-NEXT: vandps %ymm15, %ymm8, %ymm8 ; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14 -; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6 -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14] -; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX-NEXT: vorps %ymm14, %ymm8, %ymm8 +; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] +; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX-NEXT: vextractf128 $1, %ymm6, %xmm15 +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm15 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255] ; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm14, %xmm14 -; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm6, %ymm6 +; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8 ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u] -; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10 -; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] -; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7 -; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] -; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm4 +; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5],xmm3[6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u] +; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] +; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u],zero,zero,zero,xmm5[2,7,12,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = 
xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[0,5,10,15] +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vpblendvb %xmm11, %xmm2, %xmm3, %xmm3 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, (%rsi) ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, (%rdx) ; AVX-NEXT: vmovaps %ymm12, (%rcx) -; AVX-NEXT: vmovaps %ymm6, (%r8) +; AVX-NEXT: vmovaps %ymm8, (%r8) ; AVX-NEXT: vmovaps %ymm0, (%r9) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -5207,382 +5210,391 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX-LABEL: load_i8_stride5_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $488, %rsp # imm = 0x1E8 +; AVX-NEXT: subq $456, %rsp # imm = 0x1C8 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0] ; AVX-NEXT: vmovdqa (%rdi), %xmm8 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm11 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm0 -; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm13 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm0 +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128] ; AVX-NEXT: # xmm4 = mem[0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm0 -; AVX-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm0 +; AVX-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3] ; AVX-NEXT: # xmm5 = mem[0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm6 +; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpor %xmm0, %xmm6, %xmm6 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 -; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX-NEXT: vmovdqa 176(%rdi), %xmm14 -; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 160(%rdi), %xmm13 -; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm14 +; AVX-NEXT: vmovdqa 176(%rdi), %xmm11 +; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 160(%rdi), %xmm15 +; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm3 +; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3 -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm3 +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 192(%rdi), %xmm1 ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm4 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm6 -; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] +; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm3 +; AVX-NEXT: vmovq {{.*#+}} xmm4 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm5 +; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] +; AVX-NEXT: # xmm5 = mem[0,0] +; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm6 +; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] ; AVX-NEXT: # xmm7 = mem[0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm6 -; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4] +; AVX-NEXT: vpshufb %xmm7, %xmm13, %xmm8 +; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX-NEXT: vpblendvb %xmm0, %xmm3, %xmm6, %xmm3 +; AVX-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb %xmm2, %xmm15, %xmm2 +; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm3 +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] ; AVX-NEXT: # xmm8 = mem[0,0] -; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm9 -; AVX-NEXT: vpor %xmm6, %xmm9, %xmm6 -; AVX-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2 -; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 -; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm4 -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11] -; AVX-NEXT: # xmm11 = mem[0,0] -; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4 -; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm5 +; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm4 +; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm5 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 -; AVX-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0 +; AVX-NEXT: vmovdqa 
144(%rdi), %xmm10 +; AVX-NEXT: vpblendvb %xmm0, %xmm2, %xmm4, %xmm0 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] -; AVX-NEXT: # xmm7 = mem[0,0] -; AVX-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX-NEXT: vpshufb %xmm7, %xmm13, %xmm3 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] -; AVX-NEXT: # xmm5 = mem[0,0] +; AVX-NEXT: vpshufb %xmm8, %xmm10, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128] +; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 +; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX-NEXT: vpor %xmm0, %xmm4, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3] +; AVX-NEXT: # xmm4 = mem[0,0] ; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm3 +; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm5 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128] ; AVX-NEXT: # xmm6 = mem[0,0] -; AVX-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 96(%rdi), %xmm11 +; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm12 +; AVX-NEXT: vpor %xmm5, %xmm12, %xmm5 +; AVX-NEXT: vpxor %xmm9, %xmm9, %xmm9 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm5[2,3,4,5,6,7] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa 80(%rdi), %xmm1 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm12 -; AVX-NEXT: vpor %xmm3, %xmm12, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa 80(%rdi), %xmm14 -; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm15 -; AVX-NEXT: vpor %xmm15, %xmm12, %xmm12 -; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm15 +; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] +; AVX-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11] -; AVX-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9 -; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm12, %ymm14, %ymm14 +; AVX-NEXT: vandnps %ymm13, %ymm12, %ymm13 +; AVX-NEXT: vorps %ymm13, %ymm14, %ymm13 +; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; 
AVX-NEXT: vmovdqa 288(%rdi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm9 -; AVX-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX-NEXT: vmovdqa 272(%rdi), %xmm10 -; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa 256(%rdi), %xmm9 -; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm6 -; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7] -; AVX-NEXT: vmovdqa 240(%rdi), %xmm1 -; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX-NEXT: vmovdqa 224(%rdi), %xmm5 -; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX-NEXT: vmovdqa 304(%rdi), %xmm5 +; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm8 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2 -; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12] -; AVX-NEXT: # xmm2 = mem[0,0] -; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm0 -; AVX-NEXT: vmovdqa %xmm8, %xmm11 +; AVX-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpor %xmm3, %xmm8, %xmm3 +; AVX-NEXT: vmovdqa 272(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm4 +; AVX-NEXT: vmovdqa 256(%rdi), %xmm0 +; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6 +; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3,4,5,6,7] +; AVX-NEXT: vmovdqa 240(%rdi), %xmm8 +; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm2 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm3[5,6,7] +; AVX-NEXT: vmovdqa 224(%rdi), %xmm9 +; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm0 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12] ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128] ; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm4 -; AVX-NEXT: vpor %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX-NEXT: vmovdqa %xmm7, %xmm12 +; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm8, %xmm8 -; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa %xmm14, %xmm6 -; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm14, %xmm8, %xmm8 -; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa %xmm11, %xmm6 +; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,0,5,10,15],zero,zero,zero,xmm11[u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm11, %xmm11 +; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,6,11,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm14 +; AVX-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3,4],xmm2[5,6,7] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm15 -; AVX-NEXT: vandnps %ymm15, %ymm12, %ymm15 -; AVX-NEXT: vorps %ymm15, %ymm8, %ymm8 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4 -; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm15 +; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm15, %ymm11 +; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm11, %ymm0, %ymm11 +; AVX-NEXT: vorps %ymm11, %ymm15, %ymm11 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[2,7,12] +; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u] -; AVX-NEXT: vmovdqa %xmm9, %xmm8 -; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,4],xmm2[5,6,7] -; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm3 -; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3 -; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5] -; AVX-NEXT: # xmm4 = mem[0,0] -; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,zero,xmm3[4,9,14,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,u,0,5,10,15],zero,zero,zero,xmm11[u,u,u,u,u,u] ; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm14 -; AVX-NEXT: vpor %xmm3, %xmm14, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] -; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm12 -; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm0 -; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] -; AVX-NEXT: # xmm9 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm13 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7] -; AVX-NEXT: vpor %xmm0, %xmm12, %xmm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX-NEXT: vandnps %ymm13, %ymm12, %ymm13 -; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1 +; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5,6,7] +; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm3 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload +; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX-NEXT: vorps %ymm1, %ymm3, 
%ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13] +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm2, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,1,6,11],zero,zero,zero,zero,xmm6[u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm6 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm14 +; AVX-NEXT: vpor %xmm6, %xmm14, %xmm6 +; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,1,6,11,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm15 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128] +; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm13 +; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5,6,7] +; AVX-NEXT: vmovq {{.*#+}} xmm15 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] +; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm12 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[3,4,5,6,7] +; AVX-NEXT: vpor %xmm0, %xmm13, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 +; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0 +; AVX-NEXT: vandnps %ymm6, %ymm4, %ymm6 +; AVX-NEXT: vorps %ymm6, %ymm0, %ymm0 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,u,1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,zero,xmm4[0,5,10,15,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4 +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,4,5,6,7,8,9,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm4 -; 
AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm4 -; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3 +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm3 +; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4,5,6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7] -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] -; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm4 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5,6,7] +; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] +; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1 +; AVX-NEXT: vandnps %ymm2, %ymm3, %ymm2 ; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm7[u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [128,0,5,10,15,0,128,128,128,0,5,10,15,0,128,128] +; AVX-NEXT: # xmm8 = mem[0,0] +; AVX-NEXT: vpshufb %xmm8, %xmm10, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7] -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm3 +; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,2,7,12,128,128,128,0,0,2,7,12,128,128] +; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm15, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 -; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] -; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3 -; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4 -; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm4 +; AVX-NEXT: vpor %xmm4, %xmm0, %xmm4 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14] +; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm5 +; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255] +; AVX-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX-NEXT: vandnps %ymm5, %ymm9, %ymm5 +; AVX-NEXT: vorps %ymm5, %ymm1, %ymm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[4,9,14] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551615,255] -; AVX-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7] -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u] -; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7] -; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX-NEXT: vpor %xmm1, %xmm10, %xmm10 +; AVX-NEXT: vextractf128 $1, %ymm5, %xmm12 +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551615,255] +; AVX-NEXT: vpblendvb %xmm1, %xmm12, %xmm10, %xmm10 +; AVX-NEXT: vmovdqa %xmm1, %xmm12 +; AVX-NEXT: 
vinsertf128 $1, %xmm10, %ymm5, %ymm1 +; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm15[u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3,4,5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[3,8,13],zero,zero,zero,xmm14[u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5,6,7] +; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u],zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3 +; AVX-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm2 +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX-NEXT: vandps %ymm3, %ymm12, %ymm2 -; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm0 +; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm5, %ymm9, %ymm2 +; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX-NEXT: vpmovsxwq {{.*#+}} xmm8 = [18446744073709551615,255] -; AVX-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128] ; AVX-NEXT: # xmm0 = mem[0,0] -; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm3 -; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u] -; 
AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] -; AVX-NEXT: # xmm3 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6 -; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm7, %xmm4, %xmm4 -; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm12, %xmm12 +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm3 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15] +; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u],zero,zero,zero,xmm4[2,7,12,u,u,u,u,u,u,u] +; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128] +; AVX-NEXT: # xmm4 = mem[0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,zero,zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[1,6,11,u,u,u,u] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm7[3,8,13,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] ; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7] ; AVX-NEXT: # xmm12 = mem[0,0] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm13 +; AVX-NEXT: vmovq {{.*#+}} xmm9 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15 +; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] ; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2 +; AVX-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255] +; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm3, %xmm3 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm13, %xmm15, %xmm15 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7] -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 -; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 +; AVX-NEXT: 
vblendps {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3,4,5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u],zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u] +; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,4,5,6,7,8,u,u,u,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3 -; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,9,14],zero,zero,zero,zero,zero,zero,xmm1[u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[1,6,11,u,u,u,u] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4,5],xmm4[6,7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm4 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7] -; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] +; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm4 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, 32(%rsi) @@ -5601,8 +5613,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX-NEXT: vmovaps %ymm1, (%r8) ; AVX-NEXT: vmovaps %ymm0, 32(%r9) -; AVX-NEXT: vmovaps %ymm2, (%r9) -; AVX-NEXT: addq $488, %rsp # imm = 0x1E8 +; AVX-NEXT: vmovaps %ymm3, (%r9) +; AVX-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -6394,821 +6406,857 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512-LABEL: load_i8_stride5_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = 
[65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm26 +; AVX512-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24 ; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512-NEXT: vmovdqa %ymm0, %ymm4 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm4)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm24 ^ ymm25)) +; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm11[4,9,14],zero,zero,zero,xmm11[2,7,12,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,5,10,15],zero,zero,zero,xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u] +; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm14 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm20) | ymm9 +; AVX512-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512-NEXT: vmovdqa 224(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 
%ymm4, %ymm9 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm11 ^ ymm10)) +; AVX512-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (mem & (ymm9 ^ ymm12)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512-NEXT: vpermd %ymm12, %ymm17, %ymm15 +; AVX512-NEXT: vpermd %ymm15, %ymm17, %ymm15 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm9 +; AVX512-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm1 +; AVX512-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] 
-; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm14)) +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 +; AVX512-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm11 ^ ymm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm19 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm3)) +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; 
AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = ymm22 ^ (ymm16 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm16)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm20) +; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & zmm21) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,7,12],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512-NEXT: vmovdqa %ymm14, %ymm3 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3 
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm20) +; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm21) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero +; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = 
xmm13[3,8,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,8,13],zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[u,u,u] +; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15,u,u,u] +; AVX512-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & mem) | ymm3 +; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm3 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512-NEXT: vpandnq %zmm5, %zmm3, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm2 & zmm3) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm4 & (ymm6 ^ ymm26)) +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15] +; AVX512-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm10 ^ (ymm14 & (ymm11 ^ ymm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm12 ^ (mem & (ymm14 ^ ymm12)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm15[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3,4,5,6,7] +; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm19 & (ymm2 ^ ymm5)) +; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpermd %ymm6, %ymm17, %ymm6 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} 
ymm0 = ymm24 ^ (ymm0 & (ymm25 ^ ymm24)) +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] +; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512-NEXT: vpandnq %zmm0, %zmm3, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm5 & zmm3) +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: load_i8_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %ymm26 +; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = 
ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm4)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm24 ^ ymm25)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm11[4,9,14],zero,zero,zero,xmm11[2,7,12,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,5,10,15],zero,zero,zero,xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u] +; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm14 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm20) | ymm9 +; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm11 ^ ymm10)) +; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (mem & (ymm9 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 +; AVX512-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; 
AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm9 +; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm1 +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm14)) +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm11 ^ ymm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm19 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm3)) +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 
= ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512-FCP-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm22 ^ (ymm16 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm16)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm20) +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & zmm21) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,7,12],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 
& (ymm1 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; 
AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm20) +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm21) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm20 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[3,8,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,8,13],zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[u,u,u] +; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & mem) | ymm3 +; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm3 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512-FCP-NEXT: vpandnq %zmm5, %zmm3, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm2 & zmm3) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm4 & (ymm6 ^ ymm26)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15] +; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm10 ^ (ymm14 & (ymm11 ^ ymm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm12 ^ (mem & (ymm14 ^ ymm12)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm15[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3,4,5,6,7] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm19 & (ymm2 ^ ymm5)) +; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm17, %ymm6 
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm25 ^ ymm24)) +; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] +; AVX512-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512-FCP-NEXT: vpandnq %zmm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm5 & zmm3) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: load_i8_stride5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %ymm26 +; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm4)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm24 ^ ymm25)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm11[4,9,14],zero,zero,zero,xmm11[2,7,12,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,5,10,15],zero,zero,zero,xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u] +; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm14 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm20) | ymm9 +; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm11 ^ ymm10)) +; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (mem & (ymm9 ^ ymm12)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512DQ-NEXT: vpermd %ymm12, %ymm17, %ymm15 +; AVX512DQ-NEXT: vpermd %ymm15, %ymm17, %ymm15 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512DQ-NEXT: vmovdqa 128(%rdi), 
%xmm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm9 +; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm9, %xmm1 +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm14)) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm11 ^ ymm10)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm19 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] -; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = ymm22 ^ (ymm16 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm16)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm20) +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm9, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & zmm21) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,7,12],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm3 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, 
%ymm3, %xmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, 
%zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm20) +; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm21) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[3,8,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,128,128,128,128,128,128,128] +; 
AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,8,13],zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[u,u,u] +; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15,u,u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & mem) | ymm3 +; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm3 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512DQ-NEXT: vpandnq %zmm5, %zmm3, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm2 & zmm3) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm4 & (ymm6 ^ ymm26)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15] +; AVX512DQ-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm10 ^ (ymm14 & (ymm11 ^ ymm10)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm12 ^ (mem & (ymm14 ^ ymm12)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm15[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3,4,5,6,7] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm19 & (ymm2 ^ ymm5)) +; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpermd %ymm6, %ymm17, %ymm6 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm25 ^ ymm24)) +; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] +; AVX512DQ-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpternlogq 
{{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm3, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm5 & zmm3) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %ymm26 +; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128] +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm25 ^ (ymm10 & (ymm24 ^ ymm25)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm11[4,9,14],zero,zero,zero,xmm11[2,7,12,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,5,10,15],zero,zero,zero,xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm26 ^ (ymm11 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm9 ^ (mem & (ymm11 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,ymm11[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm20) | ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm11 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (mem & (ymm9 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15 +; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm6 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (mem & (ymm6 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,ymm6[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ~ymm16) | ymm10 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; 
AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm11 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[0,5,10,15,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm14, %xmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~ymm16) | ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm19 = [18446744073709551615,18446744073709551615,16777215,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm3)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm24 ^ ymm25)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[u,u,u] ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[2,7,12],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm14, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm22 ^ (ymm16 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm16[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm16)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm20) +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & zmm21) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm6 ^ (ymm1 & (ymm26 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[1,6,11,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[2,7,12],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm26 ^ ymm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm9 ^ (mem & (ymm1 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (ymm16 & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm22 ^ (ymm0 & (ymm23 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u] -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm20) +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpandnq %zmm2, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm7 & zmm21) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm6 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm26 ^ (ymm10 & (ymm8 ^ ymm26)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm9 ^ (mem & (ymm10 ^ ymm9)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm16 & (ymm3 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm10 ^ ymm11)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (mem & (ymm2 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm15[2,7,12,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[3,8,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm19 & (ymm1 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm22 ^ (ymm5 & (ymm23 ^ ymm22)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm24 ^ (ymm5 & (ymm25 ^ ymm24)) -; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,8,13],zero,zero,zero,xmm5[1,6,11],zero,zero,zero,zero,xmm5[u,u,u] +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & mem) | ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm3 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615] +; AVX512DQ-FCP-NEXT: vpandnq %zmm5, %zmm3, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm2 & zmm3) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm26 ^ (ymm4 & (ymm6 ^ ymm26)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,3,8,13],zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[0,5,10,15] +; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm10 ^ (ymm14 & (ymm11 ^ ymm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm12 ^ (mem & (ymm14 ^ ymm12)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm15[3,8,13,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm19 & (ymm2 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm17, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm24 ^ (ymm0 & (ymm25 ^ ymm24)) +; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm23 ^ (ymm4 & (ymm22 ^ ymm23)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm4)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpandnq %zmm0, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm5 & zmm3) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rdx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r8) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 7dbff047e4f87..a87c46929ed60 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1670,55 +1670,58 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vpternlogq 
{{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512-NEXT: vpermd (%rdx), %zmm5, %zmm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm6 & ~zmm9) | zmm3 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512-NEXT: vpor %ymm3, %ymm9, %ymm3 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX512-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] ; AVX512-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512-NEXT: vpermd %ymm10, %ymm5, %ymm5 +; AVX512-NEXT: vpandn %ymm5, %ymm4, %ymm4 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 ; 
AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) ; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX512-NEXT: vprold $16, %xmm0, %xmm8 -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512-NEXT: vprold $16, %xmm0, %xmm6 +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2],xmm10[3,4],xmm6[5],xmm10[6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm1 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512-NEXT: vpandn %ymm2, %ymm8, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm8) ; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1745,55 +1748,58 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512-FCP-NEXT: vpermd (%rdx), %zmm5, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm6 & ~zmm9) | zmm3 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm9, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512-FCP-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vpandn %ymm5, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm8 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm10[0,1],xmm6[2],xmm10[3,4],xmm6[5],xmm10[6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm1 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpandn %ymm2, %ymm8, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm8) ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1820,55 +1826,58 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512DQ-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512DQ-NEXT: vpermd (%rdx), %zmm5, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm6 & ~zmm9) | zmm3 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpor %ymm3, %ymm9, %ymm3 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512DQ-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm9, 
%ymm7 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512DQ-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQ-NEXT: vpermd %ymm10, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpandn %ymm5, %ymm4, %ymm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512DQ-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2],xmm10[3,4],xmm6[5],xmm10[6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm1 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512DQ-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQ-NEXT: vpandn %ymm2, %ymm8, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | 
(zmm0 & zmm8) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1895,55 +1904,58 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm4, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] +; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm5, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm6 & ~zmm9) | zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm9, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm12 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm7 +; 
AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm12[2],xmm10[3,4],xmm12[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm10, %ymm7 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vpandn %ymm5, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem) ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2],xmm10[3,4],xmm6[5],xmm10[6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm1 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2] ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpandn %ymm2, %ymm8, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm8) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -3069,577 +3081,601 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride3_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = 
[128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX512-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512-NEXT: vmovdqa64 32(%rsi), %xmm24 -; AVX512-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-NEXT: vmovdqa64 16(%rdi), %xmm21 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3] -; AVX512-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm14 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] -; AVX512-NEXT: vpermd %ymm3, %ymm16, %ymm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpandn %ymm3, %ymm15, %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15) -; AVX512-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX512-NEXT: vprold $16, %xmm10, %xmm11 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] +; AVX512-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa 80(%rsi), %xmm4 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vpandq %zmm21, %zmm0, %zmm3 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) -; AVX512-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa 96(%rsi), %ymm5 -; AVX512-NEXT: vmovdqa %ymm7, %ymm13 -; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512-NEXT: vmovdqa 112(%rsi), %xmm12 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3] -; AVX512-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512-NEXT: vpermd %ymm5, %ymm18, %ymm7 -; AVX512-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19) -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa 64(%rsi), %ymm7 -; AVX512-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512-NEXT: vprold $16, %xmm7, %xmm12 +; AVX512-NEXT: vpermd 64(%rdx), %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm19 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & ~zmm19) | zmm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = 
[128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512-NEXT: vprold $16, %xmm10, %xmm14 +; AVX512-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3,4],xmm14[5],xmm1[6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm15, %ymm10, %ymm4 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,1,0,2] +; AVX512-NEXT: vpermd %ymm10, %ymm20, %ymm10 +; AVX512-NEXT: vpandn %ymm10, %ymm11, %ymm10 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm11, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm1) | zmm4 +; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm23 +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3] +; AVX512-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX512-NEXT: vpermd %ymm2, %ymm18, %ymm4 +; AVX512-NEXT: vpandnq %ymm4, %ymm21, %ymm4 +; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm1 & zmm16) +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512-NEXT: vprold $16, %xmm2, %xmm4 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; 
AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512-NEXT: vpermd %ymm14, %ymm16, %ymm6 -; AVX512-NEXT: vpandn %ymm6, %ymm15, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15) -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX512-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] -; AVX512-NEXT: vpermd %ymm8, %ymm18, %ymm6 -; AVX512-NEXT: vpandnq %ymm6, %ymm22, %ymm6 -; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19) -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rcx) +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3] +; AVX512-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512-NEXT: vpermd %ymm3, %ymm20, %ymm3 +; AVX512-NEXT: vpandn %ymm3, %ymm11, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & zmm11) +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vpshufb %ymm12, 
%ymm1, %ymm1 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512-NEXT: vpermd %ymm14, %ymm18, %ymm3 +; AVX512-NEXT: vpandnq %ymm3, %ymm21, %ymm3 +; AVX512-NEXT: vpshufb %ymm15, %ymm14, %ymm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm21, %zmm16, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm1) | zmm3 +; AVX512-NEXT: vprold $16, %xmm8, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-NEXT: vprold $16, %xmm22, %xmm5 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,2,3] +; AVX512-NEXT: vpermd (%rdx), %zmm18, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm1)) +; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rcx) ; AVX512-NEXT: vmovdqa64 %zmm17, 320(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride3_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24 -; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] -; AVX512-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15) -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm11 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, 
%xmm5, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpandq %zmm21, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm12 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm18, %ymm7 -; AVX512-FCP-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19) -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm12 +; AVX512-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm19 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & ~zmm19) | zmm3 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 +; 
AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm14 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3,4],xmm14[5],xmm1[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm4 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,1,0,2] +; AVX512-FCP-NEXT: vpermd %ymm10, %ymm20, %ymm10 +; AVX512-FCP-NEXT: vpandn %ymm10, %ymm11, %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm1) | zmm4 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm4 +; AVX512-FCP-NEXT: vpandnq %ymm4, %ymm21, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm17 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm1 & zmm16) +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm4 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} 
zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm6 -; AVX512-FCP-NEXT: vpandn %ymm6, %ymm15, %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15) -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm18, %ymm6 -; AVX512-FCP-NEXT: vpandnq %ymm6, %ymm22, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19) -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512-FCP-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rcx) +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpermd %ymm3, %ymm20, %ymm3 +; AVX512-FCP-NEXT: vpandn %ymm3, %ymm11, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & zmm11) +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm13, 
%ymm3, %ymm3 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm18, %ymm3 +; AVX512-FCP-NEXT: vpandnq %ymm3, %ymm21, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm16, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm1) | zmm3 +; AVX512-FCP-NEXT: vprold $16, %xmm8, %xmm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vprold $16, %xmm22, %xmm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,2,3] +; AVX512-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rcx) ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 320(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride3_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512DQ-NEXT: vmovdqa64 32(%rsi), %xmm24 -; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-NEXT: vmovdqa64 16(%rdi), %xmm21 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512DQ-NEXT: 
vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] -; AVX512DQ-NEXT: vpermd %ymm3, %ymm16, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpandn %ymm3, %ymm15, %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15) -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm11 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512DQ-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm4 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm3[0,1,2,3],zmm0[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vpandq %zmm21, %zmm0, %zmm3 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512DQ-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm5 -; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm13 -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm12 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3] -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512DQ-NEXT: vpermd %ymm5, %ymm18, %ymm7 -; AVX512DQ-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19) -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm7 -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQ-NEXT: vprold $16, %xmm7, %xmm12 +; AVX512DQ-NEXT: vpermd 64(%rdx), %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & ~zmm19) | zmm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm14 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9 +; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3,4],xmm14[5],xmm1[6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm10, %ymm4 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,1,0,2] +; AVX512DQ-NEXT: vpermd %ymm10, %ymm20, %ymm10 +; AVX512DQ-NEXT: vpandn %ymm10, %ymm11, %ymm10 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm11, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm1) | zmm4 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX512DQ-NEXT: vpermd %ymm2, %ymm18, %ymm4 +; AVX512DQ-NEXT: vpandnq %ymm4, %ymm21, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm1 & zmm16) +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm4 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512DQ-NEXT: vpermd %ymm14, %ymm16, %ymm6 -; AVX512DQ-NEXT: vpandn %ymm6, %ymm15, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & 
zmm15) -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX512DQ-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512DQ-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-NEXT: vpermd %ymm8, %ymm18, %ymm6 -; AVX512DQ-NEXT: vpandnq %ymm6, %ymm22, %ymm6 -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19) -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rcx) +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512DQ-NEXT: vpermd %ymm3, %ymm20, %ymm3 +; AVX512DQ-NEXT: vpandn %ymm3, %ymm11, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & zmm11) +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-NEXT: vpermd %ymm14, %ymm18, %ymm3 +; AVX512DQ-NEXT: vpandnq %ymm3, %ymm21, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm14, %ymm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm21, %zmm16, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm1) | zmm3 +; AVX512DQ-NEXT: vprold $16, %xmm8, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-NEXT: vprold $16, %xmm22, %xmm5 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-NEXT: vpermd (%rdx), %zmm18, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rcx) ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 320(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride3_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; 
AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2] -; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15) -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm10 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm21, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10] -; AVX512DQ-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm18, %ymm7 -; AVX512DQ-FCP-NEXT: vpandnq %ymm7, %ymm22, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19) -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm7 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm12 +; AVX512DQ-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm21, %zmm19 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & ~zmm19) | zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm14 +; 
AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2],xmm1[3,4],xmm14[5],xmm1[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,1,0,2] +; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm20, %ymm10 +; AVX512DQ-FCP-NEXT: vpandn %ymm10, %ymm11, %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm10, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm1) | zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm4 +; AVX512DQ-FCP-NEXT: vpandnq %ymm4, %ymm21, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm17 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm1 & zmm16) +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vinserti128 
$1, %xmm6, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm5 -; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm6 -; AVX512DQ-FCP-NEXT: vpandn %ymm6, %ymm15, %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15) -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm18, %ymm6 -; AVX512DQ-FCP-NEXT: vpandnq %ymm6, %ymm22, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rcx) +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm20, %ymm3 +; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm11, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm1 & zmm11) 
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm18, %ymm3 +; AVX512DQ-FCP-NEXT: vpandnq %ymm3, %ymm21, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm16, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm1) | zmm3 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm8, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm22, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rcx) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 320(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 3311a311c8e46..efc69ceb2fcfd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -443,14 +443,16 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm5 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm4) ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] ; AVX512-NEXT: vmovq %xmm0, 32(%r9) -; AVX512-NEXT: vmovdqa %ymm2, (%r9) +; AVX512-NEXT: vmovdqa %ymm5, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -470,13 +472,15 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm4) ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] ; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9) -; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9) +; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -496,14 +500,16 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vpandn %ymm3, %ymm4, %ymm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm4) ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] ; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9) -; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9) +; AVX512DQ-NEXT: vmovdqa %ymm5, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -523,13 +529,15 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3)) +; 
AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm2 & ymm4) ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -927,11 +935,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,ymm6[u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,22,23] ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm4 & ~zmm5) | zmm6 ; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -941,45 +951,47 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride5_vf8: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,2,10,2,10,0,0] -; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm0, %ymm5 +; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[2,3,6,7,u,u],zero,zero,zero,zero,ymm5[8,9,12,13,u,u],zero,zero,zero,zero,ymm5[18,19,22,23,u,u],zero,zero,zero,zero -; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3],zero,zero,zero,zero,ymm6[u,u,4,5,8,9],zero,zero,zero,zero,ymm6[u,u,18,19,22,23],zero,zero,zero,zero,ymm6[u,u,24,25,28,29] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) -; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm4 & ~zmm5) | zmm6 +; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1011,11 +1023,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,8,9],zero,zero,zero,zero,ymm6[u,u,2,3,10,11],zero,zero,zero,zero,ymm6[u,u,20,21,28,29],zero,zero,zero,zero,ymm6[u,u,22,23] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm4 & ~zmm5) | zmm6 ; AVX512DQ-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1025,45 +1039,47 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf8: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm4 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,9,2,10,2,10,0,0] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm0, %ymm5 +; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[2,3,6,7,u,u],zero,zero,zero,zero,ymm5[8,9,12,13,u,u],zero,zero,zero,zero,ymm5[18,19,22,23,u,u],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,0] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,ymm6[0,1,8,9,u,u],zero,zero,zero,zero,ymm6[2,3,18,19,u,u],zero,zero,zero,zero,ymm6[28,29,20,21,u,u],zero,zero ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,2,6,0,2,6,3,7] ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3],zero,zero,zero,zero,ymm6[u,u,4,5,8,9],zero,zero,zero,zero,ymm6[u,u,18,19,22,23],zero,zero,zero,zero,ymm6[u,u,24,25,28,29] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1] 
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm4 & ~zmm5) | zmm6 +; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,u,u,u,u,u,u,12,13,14,15,u,u] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] ; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r8), %xmm1 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -1744,10 +1760,13 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] -; AVX512-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpandq %zmm6, %zmm5, %zmm5 +; AVX512-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm4[0,1,1,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = (zmm8 & ~zmm6) | zmm5 ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] @@ -1819,10 +1838,13 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] -; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm6, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = 
ymm4[0,1,1,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (zmm8 & ~zmm6) | zmm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] @@ -1893,10 +1915,13 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm4[0,1,1,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (zmm8 & ~zmm6) | zmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] @@ -1968,10 +1993,13 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm7 & (ymm6 ^ ymm5)) ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[0,1,2,3],zmm8[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[0,1,1,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm6, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm4[0,1,1,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = (zmm8 & ~zmm6) | zmm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,16,17,u,u,u,u,u,u,u,u,18,19,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm8[0],ymm5[1],ymm8[2,3],ymm5[4],ymm8[5],ymm5[6],ymm8[7,8],ymm5[9],ymm8[10,11],ymm5[12],ymm8[13],ymm5[14],ymm8[15] @@ -3338,614 +3366,624 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride5_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] -; AVX512-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512-NEXT: vmovdqa64 (%rdx), %ymm21 +; AVX512-NEXT: vmovdqa64 32(%rdx), %ymm16 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3,4],ymm4[5,6,7,8],ymm2[9],ymm4[10],ymm2[11,12],ymm4[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-NEXT: vpandnq %zmm0, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm2[0,1,0,1] -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm17 & (zmm12 ^ zmm0)) -; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12)) -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10],ymm4[11],ymm13[12,13],ymm4[14],ymm13[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[0,1,0,1] +; AVX512-NEXT: vpandq %zmm19, %zmm2, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm14 & (zmm2 | zmm0) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm0 +; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm23 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (zmm23 & ~zmm0) | zmm2 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm22 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512-NEXT: vprolq $16, %ymm15, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7,8],ymm2[9],ymm13[10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpandnq %zmm0, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm5, %ymm0, %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512-NEXT: vpermq {{.*#+}} 
ymm4 = ymm4[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2],ymm5[3,4],ymm13[5,6,7,8],ymm5[9],ymm13[10],ymm5[11,12],ymm13[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512-NEXT: vpandq %zmm17, %zmm4, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm20 & (zmm4 | zmm2) +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm2 +; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm14 & ~zmm2) | zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm16[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13],ymm0[14],ymm12[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm15 -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm12 -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm11 -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm13 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm12)) -; AVX512-NEXT: vmovdqa (%r8), %ymm10 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512-NEXT: vpshufb %ymm2, %ymm15, %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm22[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512-NEXT: vpshufb %xmm15, %xmm8, %xmm5 +; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; 
AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5],xmm11[6],xmm12[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4)) +; AVX512-NEXT: vmovdqa (%r8), %ymm11 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] -; AVX512-NEXT: vpandnq %ymm11, %ymm20, %ymm11 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm13 & zmm20) +; AVX512-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX512-NEXT: vpandnq %ymm4, %ymm16, %ymm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & zmm16) +; AVX512-NEXT: vpshufb %xmm15, %xmm9, %xmm5 +; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm9[0,1,0,1],zmm5[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512-NEXT: 
vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0)) -; AVX512-NEXT: vpbroadcastq (%r8), %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,1,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm14, %ymm3 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm8 -; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,2,3,3,7,6,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3,4],ymm9[5,6,7,8],ymm5[9],ymm9[10],ymm5[11,12],ymm9[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm2 & (zmm5 ^ zmm1)) -; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm5)) -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm6, %ymm8 -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[2,3,2,3] -; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm3[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm17 & (zmm3 ^ zmm4)) -; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm2, %ymm2 -; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm3 & mem) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm1, 256(%r9) -; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm11, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm5)) +; AVX512-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm0)) +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm7, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm6[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13],ymm5[14],ymm2[15] +; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm19 & (zmm1 ^ zmm0)) +; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm20, %ymm0 +; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) +; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm14, 256(%r9) +; 
AVX512-NEXT: vmovdqa64 %zmm23, 128(%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride5_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %xmm18 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm19 -; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 32(%rdx), %ymm16 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdx), %ymm18 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] -; AVX512-FCP-NEXT: vpandnq %ymm3, %ymm17, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm12 & zmm17) -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512-FCP-NEXT: vprolq $16, %ymm15, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7,8],ymm10[9],ymm0[10],ymm10[11],ymm0[12,13],ymm10[14],ymm0[15] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm7[2],ymm3[3],ymm7[4],ymm3[5,6],ymm7[7],ymm3[8,9],ymm7[10],ymm3[11],ymm7[12],ymm3[13,14],ymm7[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,3,2,3,10,11,10,10] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm17 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm18[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3,4],ymm7[5,6,7,8],ymm10[9],ymm7[10],ymm10[11,12],ymm7[13,14,15] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,3,2,3,10,10,11,10] +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm13, %zmm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm13 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3,4],ymm0[5,6,7,8],ymm10[9],ymm0[10],ymm10[11,12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm16[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13],ymm3[14],ymm8[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm4[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm2, %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm1 & mem) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = 
[30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vprolq $16, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,3,2,3,10,11,10,10] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3,4],ymm5[5,6,7,8],ymm13[9],ymm5[10],ymm13[11,12],ymm5[13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm14 & (zmm13 ^ zmm10)) -; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm13)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,2,3,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,3,2,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm8 ^ (zmm17 & (zmm4 ^ zmm8)) -; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2],xmm9[3],xmm4[4,5],xmm9[6],xmm4[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,1,8,9,8,8] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,0,1],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm14 & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,1,1,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm4[0,1,2,3],zmm3[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5],xmm15[6],xmm4[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,0,1,8,9,8,8] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm2 +; AVX512-FCP-NEXT: 
vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpandn %ymm5, %ymm6, %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm8, %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & zmm6) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpandnq %zmm17, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpandq %zmm0, %zmm7, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm8 & (zmm6 | zmm1) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm8 +; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm13 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & ~zmm7) | zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-FCP-NEXT: vpandnq %zmm10, %zmm6, %zmm7 +; AVX512-FCP-NEXT: vpandq %zmm6, %zmm12, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm1 & (zmm10 | zmm7) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), 
%ymm12 +; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm13 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & ~zmm1) | zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm6 & (zmm9 ^ zmm16)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm11 ^ (zmm0 & (zmm2 ^ zmm11)) +; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm7 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride5_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm12 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm23 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3,4],ymm2[5,6,7,8],ymm1[9],ymm2[10],ymm1[11,12],ymm2[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm21 +; AVX512DQ-NEXT: vmovdqa64 32(%rdx), %ymm16 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3,4],ymm4[5,6,7,8],ymm2[9],ymm4[10],ymm2[11,12],ymm4[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm8 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7,8],ymm12[9],ymm14[10],ymm12[11],ymm14[12,13],ymm12[14],ymm14[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm2[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm17 & (zmm12 ^ zmm0)) -; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm0 -; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm12)) -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm14 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1],ymm13[2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7,8],ymm4[9],ymm13[10],ymm4[11],ymm13[12,13],ymm4[14],ymm13[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vpandq %zmm19, %zmm2, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm14 & (zmm2 | zmm0) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm14, %zmm0 +; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm4 +; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm23 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (zmm23 & ~zmm0) | zmm2 +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm22 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm15 +; AVX512DQ-NEXT: vprolq $16, %ymm15, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm2[1],ymm13[2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7,8],ymm2[9],ymm13[10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15] +; AVX512DQ-NEXT: vpermq 
{{.*#+}} ymm2 = ymm2[2,3,2,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm5, %ymm0, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm25 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2],ymm5[3,4],ymm13[5,6,7,8],ymm5[9],ymm13[10],ymm5[11,12],ymm13[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpandq %zmm17, %zmm4, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm20 & (zmm4 | zmm2) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm20, %zmm2 +; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm5 +; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm14 & ~zmm2) | zmm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm16[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5],ymm0[6],ymm12[7,8],ymm0[9],ymm12[10,11],ymm0[12],ymm12[13],ymm0[14],ymm12[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm15 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm12 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7,8],ymm1[9],ymm13[10],ymm1[11],ymm13[12,13],ymm1[14],ymm13[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5],xmm10[6],xmm11[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm12)) -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm10 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm15, %ymm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm22[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm8, %xmm5 +; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13],ymm0[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5],xmm11[6],xmm12[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4)) +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,1] -; AVX512DQ-NEXT: vpandnq %ymm11, %ymm20, %ymm11 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm13 & zmm20) +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm4, %ymm13 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] +; AVX512DQ-NEXT: vpandnq %ymm4, %ymm16, %ymm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm5 & zmm16) +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm9, %xmm5 +; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm13 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3],xmm13[4],xmm5[5],xmm13[6],xmm5[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6] +; AVX512DQ-NEXT: vshufi64x2 
{{.*#+}} zmm5 = zmm9[0,1,0,1],zmm5[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm9 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm13 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm2 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm8 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm10[0,1,1,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm14, %ymm3 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8,9],ymm1[10],ymm3[11],ymm1[12],ymm3[13,14],ymm1[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,2,3,3,7,6,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm9[0],ymm5[1],ymm9[2],ymm5[3,4],ymm9[5,6,7,8],ymm5[9],ymm9[10],ymm5[11,12],ymm9[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm2 & (zmm5 ^ zmm1)) -; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm1 -; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm5)) -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm6, %ymm8 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3],ymm5[4],ymm8[5,6],ymm5[7],ymm8[8,9],ymm5[10],ymm8[11],ymm5[12],ymm8[13,14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[2,3,2,3] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[0,1,2,3],zmm3[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm17 & (zmm3 ^ zmm4)) -; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm3 & mem) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 128(%r9) +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6],xmm0[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm5)) +; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm0)) +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm7, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm7, %ymm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm6[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5],ymm2[6],ymm5[7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13],ymm2[14],ymm5[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3],ymm5[4],ymm2[5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10,11],ymm5[12],ymm2[13],ymm5[14],ymm2[15] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm19 & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm20, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 128(%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %xmm18 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm19 -; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdx), %ymm16 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = 
ymm16[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8],ymm3[9],ymm7[10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdx), %ymm18 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2],xmm12[3],xmm7[4,5],xmm12[6],xmm7[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpandnq %ymm3, %ymm17, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm12 & zmm17) -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm15, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13],ymm2[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7,8],ymm10[9],ymm0[10],ymm10[11],ymm0[12,13],ymm10[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1],ymm7[2],ymm3[3],ymm7[4],ymm3[5,6],ymm7[7],ymm3[8,9],ymm7[10],ymm3[11],ymm7[12],ymm3[13,14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [2,3,2,3,10,11,10,10] 
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm17 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm18[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3,4],ymm7[5,6,7,8],ymm10[9],ymm7[10],ymm10[11,12],ymm7[13,14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [2,3,2,3,10,10,11,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm13, %zmm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2],ymm10[3,4],ymm0[5,6,7,8],ymm10[9],ymm0[10],ymm10[11,12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm12 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm0[2],ymm4[3],ymm0[4],ymm4[5,6],ymm0[7],ymm4[8,9],ymm0[10],ymm4[11],ymm0[12],ymm4[13,14],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm16[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13],ymm3[14],ymm8[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[0,1,2,3],zmm4[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm4[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm17 & (zmm1 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm2, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm20 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,3,2,3,10,11,10,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2],ymm13[3,4],ymm5[5,6,7,8],ymm13[9],ymm5[10],ymm13[11,12],ymm5[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3],ymm13[4],ymm0[5],ymm13[6],ymm0[7,8],ymm13[9],ymm0[10,11],ymm13[12],ymm0[13],ymm13[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm0, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm14 & (zmm13 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 
48(%r8), %ymm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3,4],ymm4[5,6,7,8],ymm5[9],ymm4[10],ymm5[11,12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,2,3,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2],ymm4[3],ymm1[4,5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10],ymm4[11],ymm1[12,13],ymm4[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,3,2,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm8 ^ (zmm17 & (zmm4 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2],xmm9[3],xmm4[4,5],xmm9[6],xmm4[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,1,8,9,8,8] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm14 & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm20[0,1,1,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 
128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9) +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7,8],ymm4[9],ymm9[10],ymm4[11],ymm9[12,13],ymm4[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm4[0,1,2,3],zmm3[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5],xmm15[6],xmm4[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,0,1,8,9,8,8] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm5, %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm8, %ymm13 +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & zmm6) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandnq %zmm17, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpandq %zmm0, %zmm7, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm8 & (zmm6 | zmm1) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm8 +; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & ~zmm7) | zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpandnq %zmm10, %zmm6, %zmm7 +; AVX512DQ-FCP-NEXT: vpandq %zmm6, %zmm12, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm1 & (zmm10 | zmm7) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm12 +; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & ~zmm1) | zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm6 & (zmm9 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm11 ^ (zmm0 & (zmm2 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm7 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq @@ -6726,1219 +6764,1295 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride5_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm3 -; AVX512-NEXT: vmovdqa 96(%rdx), %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX512-NEXT: vpbroadcastq 104(%rdi), %xmm6 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm29 -; AVX512-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512-NEXT: vpandn %ymm4, %ymm3, %ymm4 -; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm26 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512-NEXT: vmovdqa64 64(%rdx), %ymm30 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,2,3,3,7,6,7,7] -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512-NEXT: vmovdqa 96(%rdx), %xmm0 +; AVX512-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: 
vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3,4],ymm4[5,6,7,8],ymm2[9],ymm4[10],ymm2[11,12],ymm4[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4],ymm4[5,6,7,8],ymm3[9],ymm4[10],ymm3[11,12],ymm4[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm30 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm4[0,1,2,3],zmm3[0,1,0,1] +; AVX512-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm4 +; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm22 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 +; AVX512-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = 
ymm4[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10,11],ymm1[12],ymm5[13],ymm1[14],ymm5[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512-NEXT: vpbroadcastq 104(%rdi), %xmm5 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm28 +; AVX512-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX512-NEXT: vpandnq %ymm1, %ymm21, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm0, %ymm2 +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512-NEXT: vpshufb %xmm15, %xmm1, %xmm2 ; AVX512-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm4[0,1,0,1] +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,6] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1] ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512-NEXT: vmovdqa 64(%rcx), %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = 
xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa64 (%rdi), %ymm26 +; AVX512-NEXT: vpshufb %xmm15, %xmm0, %xmm2 ; AVX512-NEXT: vpbroadcastq 8(%rdi), %xmm4 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512-NEXT: vmovdqa (%rsi), %ymm5 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm2[0,1,0,1] -; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm5, %ymm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm12, %ymm5, %ymm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm26[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb %ymm7, %ymm10, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,2,3,3,7,6,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm1[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10],ymm11[11],ymm4[12,13],ymm11[14],ymm4[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512-NEXT: vmovdqa %xmm8, %xmm7 +; AVX512-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm10[0,1,0,1] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,6] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm10[0,1,0,1] -; AVX512-NEXT: vmovdqa64 32(%rdx), %ymm28 -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm10 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8],ymm10[9],ymm3[10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4,5],xmm8[6],xmm5[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm27 -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm31 -; 
AVX512-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm31[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] -; AVX512-NEXT: vpshufb %xmm9, %xmm11, %xmm5 -; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm10 -; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm10[0,1,0,1] +; AVX512-NEXT: vmovdqa64 32(%rdx), %ymm19 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512-NEXT: vpshufb %ymm6, %ymm10, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] +; AVX512-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm9 +; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm26 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm26[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13],ymm3[14],ymm8[15] +; AVX512-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512-NEXT: vpbroadcastq 40(%rdi), %xmm8 +; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm16 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm1 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm3[0,1,1,1] -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512-NEXT: vpandnq %ymm10, %ymm21, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm22 -; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] +; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512-NEXT: vpandnq %ymm5, %ymm21, %ymm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm6 +; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm15 ; AVX512-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,2,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0],xmm7[1],xmm3[2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm1, %ymm7 -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] -; AVX512-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm3[2,3,2,3] 
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm11, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] +; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm5 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm30[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm3[2,3,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[0,1,2,1,4,5,6,5] +; AVX512-NEXT: vprolq $16, %ymm2, %ymm5 +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm13 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5],ymm2[6],ymm13[7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13],ymm2[14],ymm13[15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,2,3,3,7,6,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, 
%ymm15, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8,9],ymm2[10],ymm14[11],ymm2[12],ymm14[13,14],ymm2[15] -; AVX512-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm17[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[3,0,3,0,7,4,7,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10],ymm6[11],ymm14[12,13],ymm6[14],ymm14[15] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,2,1,4,5,6,5] -; AVX512-NEXT: vprolq $16, %ymm8, %ymm14 -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3],ymm6[4],ymm14[5,6],ymm6[7],ymm14[8,9],ymm6[10],ymm14[11],ymm6[12],ymm14[13,14],ymm6[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512-NEXT: # ymm14 = mem[0,1,0,0] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7,8],ymm1[9],ymm8[10],ymm1[11],ymm8[12,13],ymm1[14],ymm8[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,3,2,2] -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,2,5,5,5,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[3,2,3,3,7,6,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2],ymm1[3,4],ymm4[5,6,7,8],ymm1[9],ymm4[10],ymm1[11,12],ymm4[13,14,15] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = mem ^ (zmm28 & (zmm29 ^ mem)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = 
[65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm23 # 64-byte Folded Reload -; AVX512-NEXT: # zmm23 = mem ^ (zmm30 & (zmm23 ^ mem)) -; AVX512-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512-NEXT: vpbroadcastq 96(%r8), %ymm19 -; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm1, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm23)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm24 ^ (zmm30 & (zmm25 ^ zmm24)) -; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm17 -; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm23 -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX512-NEXT: vmovdqa (%r8), %ymm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm19 & (zmm17 ^ zmm25)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm27 ^ (zmm28 & (zmm16 ^ zmm27)) -; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,1,1] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm23, %ymm25 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm5 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512-NEXT: vpandq %zmm5, %zmm20, %zmm11 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 & (zmm11 | zmm8) +; AVX512-NEXT: vpbroadcastq 88(%r8), %ymm8 +; AVX512-NEXT: vpbroadcastq 96(%r8), %ymm16 +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & ~zmm16) | zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm22 ^ (zmm5 & (zmm31 ^ zmm22)) +; AVX512-NEXT: vpbroadcastq 24(%r8), %ymm11 +; AVX512-NEXT: vpbroadcastq 32(%r8), %ymm22 +; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm11, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm16 & (zmm22 ^ zmm31)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm18 ^ (zmm16 & (zmm28 ^ zmm18)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm16 & (zmm1 ^ zmm9)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vmovdqa64 %zmm21, %zmm31 +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm21, %zmm16 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = (zmm16 & zmm1) | zmm6 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa 64(%r8), %ymm8 -; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm8[0,1,1,1] -; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm8 -; AVX512-NEXT: vpandnq 80(%r8){1to4}, %ymm23, %ymm27 -; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 | (zmm29 & zmm1) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm16 & zmm1) -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11 ^ (zmm14 & (zmm11 ^ mem)) -; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm16 -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm16, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm24 & (zmm16 ^ zmm11)) -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq $184, (%rsp), %zmm14, %zmm9 # 64-byte Folded Reload -; AVX512-NEXT: # zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ mem)) -; AVX512-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm24 & (zmm10 ^ zmm9)) -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm9 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm9 ^ (zmm14 & (zmm5 ^ zmm9)) -; AVX512-NEXT: vpbroadcastq 112(%r8), %ymm9 -; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm23 & (zmm9 ^ zmm5)) -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm6, %zmm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm14 & (zmm3 ^ zmm5)) -; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm21 ^ (zmm30 & (zmm18 ^ zmm21)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm20 ^ (zmm30 & (zmm7 ^ zmm20)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm18 & zmm3) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & zmm3) -; AVX512-NEXT: vmovdqa64 %zmm8, 384(%r9) -; AVX512-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512-NEXT: vmovdqa64 %zmm9, 576(%r9) -; AVX512-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm16, 320(%r9) -; AVX512-NEXT: vmovdqa64 %zmm2, 448(%r9) -; AVX512-NEXT: vmovdqa64 %zmm26, 512(%r9) -; AVX512-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,0,3,0,7,4,7,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7,8],ymm0[9],ymm6[10],ymm0[11],ymm6[12,13],ymm0[14],ymm6[15] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[0,1,2,3],zmm3[2,3,2,3] +; AVX512-NEXT: vmovdqa (%r8), %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-NEXT: vpshufb %ymm4, %ymm10, %ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[1,1,1,2,5,5,5,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7,8],ymm9[9],ymm2[10,11],ymm9[12],ymm2[13],ymm9[14],ymm2[15] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[3,2,3,3,7,6,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3,4],ymm10[5,6,7,8],ymm11[9],ymm10[10],ymm11[11,12],ymm10[13,14,15] +; AVX512-NEXT: vpandnq 16(%r8){1to4}, %ymm9, %ymm11 +; AVX512-NEXT: vmovdqa %ymm14, %ymm3 +; AVX512-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 +; AVX512-NEXT: vmovdqa 64(%r8), %ymm14 +; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm11 +; AVX512-NEXT: vpandnq 80(%r8){1to4}, %ymm9, %ymm26 +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm11, %zmm11 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm28 & zmm31) +; AVX512-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512-NEXT: vpermq $238, (%rsp), %ymm26 # 32-byte Folded Reload +; AVX512-NEXT: # ymm26 = mem[2,3,2,3] +; AVX512-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512-NEXT: # ymm28 = mem[2,3,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,3,2,3] +; AVX512-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512-NEXT: # ymm31 = mem[2,2,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm27[0,1,0,0] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm23[0,1,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,0] +; 
AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm26, %zmm26 +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm29, %zmm28 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpandnq %zmm26, %zmm29, %zmm26 +; AVX512-NEXT: vpandq %zmm29, %zmm28, %zmm28 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm9 & (zmm28 | zmm26) +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12 +; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512-NEXT: # zmm12 = zmm12 ^ (zmm29 & (zmm12 ^ mem)) +; AVX512-NEXT: vpbroadcastq 64(%r8), %ymm15 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm30 & (zmm14 ^ zmm12)) +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = zmm4 ^ (zmm29 & (zmm4 ^ mem)) +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm30, %zmm12 +; AVX512-NEXT: vpbroadcastq (%r8), %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm12 & (zmm1 ^ zmm4)) +; AVX512-NEXT: vpbroadcastq 112(%r8), %ymm4 +; AVX512-NEXT: vpbroadcastq 120(%r8), %ymm12 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & ~zmm12) | zmm28 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm29 & (zmm2 ^ zmm0)) +; AVX512-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX512-NEXT: vpbroadcastq 56(%r8), %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm12 & (zmm0 ^ zmm2)) +; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm25 # 64-byte Folded Reload +; AVX512-NEXT: # zmm25 = mem ^ (zmm5 & (zmm25 ^ mem)) +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm18, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & zmm25) | zmm19 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm24 ^ (zmm5 & (zmm6 ^ zmm24)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm6 & zmm18) +; AVX512-NEXT: vmovdqa64 %zmm11, 384(%r9) +; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm16, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm22, 128(%r9) +; AVX512-NEXT: vmovdqa64 %zmm14, 320(%r9) +; AVX512-NEXT: vmovdqa64 %zmm4, 576(%r9) +; AVX512-NEXT: vmovdqa64 %zmm21, 512(%r9) +; AVX512-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512-NEXT: addq $360, %rsp # imm = 0x168 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride5_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdx), %ymm22 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} 
ymm2 = ymm22[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512-FCP-NEXT: subq $536, %rsp # imm = 0x218 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm24 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm24[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm31 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = 
[18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vprolq $16, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm31[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm23[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512-FCP-NEXT: vprolq $16, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm26[1,1,2,2] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vprolq $16, %ymm6, %ymm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm2 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm27[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %ymm30 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm31 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[3,0,3,0,7,4,7,4] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,0,3,0,7,4,7,4] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512-FCP-NEXT: 
vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10,11],ymm6[12],ymm9[13],ymm6[14],ymm9[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,1,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX512-FCP-NEXT: vpandn %ymm9, %ymm15, %ymm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm9 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm18 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm9 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10],ymm13[11],ymm9[12,13],ymm13[14],ymm9[15] -; AVX512-FCP-NEXT: vprolq $16, %ymm14, %ymm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,3,2,3,10,11,10,10] -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3,4],ymm13[5,6,7,8],ymm15[9],ymm13[10],ymm15[11,12],ymm13[13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm13, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm14 ^ (zmm24 & (zmm7 ^ zmm14)) -; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm14 -; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm22 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm6 & (zmm22 ^ zmm7)) -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 -; AVX512-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm6, %ymm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX512-FCP-NEXT: vprolq $16, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,2,5,5,5,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm24 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,0,1],zmm1[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX512-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm22 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,1] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpandn %ymm3, %ymm8, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm28 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdx), %ymm17 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 32(%rdx), %xmm26 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm26[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm12 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm3[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13],ymm12[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX512-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5],xmm14[6],xmm15[7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm24 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,1,1] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512-FCP-NEXT: vpandnq %ymm14, %ymm16, %ymm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm14 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7,8],ymm7[9],ymm14[10],ymm7[11],ymm14[12,13],ymm7[14],ymm14[15] +; AVX512-FCP-NEXT: vprolq $16, %ymm11, %ymm11 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm5[2],ymm11[3],ymm5[4],ymm11[5,6],ymm5[7],ymm11[8,9],ymm5[10],ymm11[11],ymm5[12],ymm11[13,14],ymm5[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,11,10,10] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm11 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = 
ymm9[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm14[1],ymm7[2],ymm14[3,4],ymm7[5,6,7,8],ymm14[9],ymm7[10],ymm14[11,12],ymm7[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,3,2,3,10,10,11,10] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512-FCP-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3,4],ymm14[5,6,7,8],ymm15[9],ymm14[10],ymm15[11,12],ymm14[13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,3,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm14 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm27[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7,8],ymm9[9],ymm6[10],ymm9[11],ymm6[12,13],ymm9[14],ymm6[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm20, %ymm9 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4 +; AVX512-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm20, %ymm25 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1],xmm2[2],xmm15[3],xmm2[4,5],xmm15[6],xmm2[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,0,1,8,9,8,8] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm27 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm0 ^ (zmm27 & (zmm30 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vprolq $16, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,1,4,5,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512-FCP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = mem[3,2,3,3,7,6,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm5[1],ymm3[2],ymm5[3,4],ymm3[5,6,7,8],ymm5[9],ymm3[10],ymm5[11,12],ymm3[13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm27 & (zmm1 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,3,2,2,8,9,8,9] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 ; AVX512-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm1 
= xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [2,3,2,2,8,9,8,9] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm4 ^ (zmm2 & (zmm13 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,2,3,3,7,6,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512-FCP-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm8 & (zmm1 ^ zmm13)) -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX512-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,0,1],zmm9[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm10 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = 
xmm10[0],xmm13[1],xmm10[2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,0,1,8,9,8,8] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm9 ^ (zmm24 & (zmm4 ^ zmm9)) -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,3,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm6 ^ (zmm2 & (zmm10 ^ zmm6)) -; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm6 -; AVX512-FCP-NEXT: vpbroadcastq 32(%r8), %ymm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm8 & (zmm6 ^ zmm10)) +; AVX512-FCP-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm20, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm3 & (zmm0 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm8 ^ (zmm1 & (zmm5 ^ zmm8)) ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm25[0,1,1,1] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm12 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm10[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2],xmm13[3],xmm12[4,5],xmm13[6],xmm12[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm24 & (zmm5 ^ zmm3)) -; AVX512-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,1] -; AVX512-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm10 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm10 = mem ^ (zmm8 & (zmm10 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm20 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm20 = mem ^ (zmm8 & (zmm20 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm10 & zmm7) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm20 & zmm7) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm28 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm28 = mem ^ (zmm2 & (zmm28 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm19 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm19 = mem ^ (zmm2 & (zmm19 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm28 & zmm2) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm19 & zmm2) -; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm4)) -; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm5)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm10[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,2,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512-FCP-NEXT: vpandnq %zmm11, %zmm27, %zmm11 +; AVX512-FCP-NEXT: vpandq %zmm27, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpandnq %zmm14, %zmm1, %zmm13 +; AVX512-FCP-NEXT: vpandq %zmm1, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1] +; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpbroadcastq 24(%r8), %ymm14 +; AVX512-FCP-NEXT: 
vpbroadcastq 32(%r8), %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm27 & (zmm8 ^ zmm10)) +; AVX512-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,1,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm22 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm22 = mem ^ (zmm10 & (zmm22 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm24 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm24 = mem ^ (zmm10 & (zmm24 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm18, %zmm16 +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = (zmm16 & zmm24) | mem +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm20 & (zmm7 | zmm11) +; AVX512-FCP-NEXT: vpbroadcastq 112(%r8), %ymm11 +; AVX512-FCP-NEXT: vpbroadcastq 120(%r8), %ymm17 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & ~zmm3) | zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 & (zmm6 | zmm13) +; AVX512-FCP-NEXT: vpbroadcastq 88(%r8), %ymm2 +; AVX512-FCP-NEXT: vpbroadcastq 96(%r8), %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & ~zmm15) | zmm6 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = mem ^ (zmm1 & (zmm6 ^ mem)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm10, %zmm3 +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = (zmm3 & zmm6) | mem +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = mem ^ (zmm1 & (zmm6 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm6 & zmm10) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 | (zmm22 & zmm18) +; AVX512-FCP-NEXT: vpbroadcastq 64(%r8), %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm12 & (zmm1 ^ zmm30)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 +; AVX512-FCP-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 
%zmm14, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 384(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 576(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 512(%r9) -; AVX512-FCP-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 576(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 512(%r9) +; AVX512-FCP-NEXT: addq $536, %rsp # imm = 0x218 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride5_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $264, %rsp # imm = 0x108 -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm20 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm3 -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4,5],xmm4[6],xmm3[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm14[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm5 -; AVX512DQ-NEXT: vpbroadcastq 104(%rdi), %xmm6 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm29 -; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm5 -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] -; AVX512DQ-NEXT: vpandn %ymm4, %ymm3, %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm26 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm18 -; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %ymm30 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,2,3,3,7,6,7,7] -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512DQ-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm0 +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdx), %ymm19 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3,4],ymm4[5,6,7,8],ymm2[9],ymm4[10],ymm2[11,12],ymm4[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm17 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm15[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3,4],ymm4[5,6,7,8],ymm3[9],ymm4[10],ymm3[11,12],ymm4[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; AVX512DQ-NEXT: vpshufhw 
{{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm30 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm11 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm11[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm4[0,1,2,3],zmm3[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm22 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm1 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10,11],ymm1[12],ymm5[13],ymm1[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpbroadcastq 104(%rdi), %xmm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm28 +; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm1, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX512DQ-NEXT: vpandnq %ymm1, %ymm21, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm0, %ymm2 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} 
ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm1, %xmm2 ; AVX512DQ-NEXT: vpbroadcastq 72(%rdi), %xmm4 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm4[0,1,0,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,5,6] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1] ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm2 -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7] -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm26 +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm0, %xmm2 ; AVX512DQ-NEXT: vpbroadcastq 8(%rdi), %xmm4 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7] -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm5 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm2[0,1,0,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = 
ymm26[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm10, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm10, %ymm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm16 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5,6,7,8],ymm0[9],ymm1[10],ymm0[11,12],ymm1[13,14,15] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,2,3,3,7,6,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm0[0,1,2,3],zmm1[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm1[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7,8],ymm11[9],ymm4[10],ymm11[11],ymm4[12,13],ymm11[14],ymm4[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,2] -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} 
xmm10 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm10, %xmm10 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[0,1,2,3],zmm10[0,1,0,1] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8],ymm5[9],ymm0[10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,6] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm4[0,1,2,3],zmm10[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 32(%rdx), %ymm28 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm10 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4,5],ymm10[6],ymm3[7,8],ymm10[9],ymm3[10],ymm10[11],ymm3[12,13],ymm10[14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2],xmm8[3],xmm5[4,5],xmm8[6],xmm5[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm27 -; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm31 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm31[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm11, %xmm5 -; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm10 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5],xmm10[6],xmm5[7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,2,3],zmm10[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 32(%rdx), %ymm19 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm10, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm17 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7,8],ymm0[9],ymm8[10],ymm0[11],ymm8[12,13],ymm0[14],ymm8[15] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6],xmm1[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm26 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm26[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5],ymm3[6],ymm8[7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13],ymm3[14],ymm8[15] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpbroadcastq 40(%rdi), %xmm8 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm1 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm3[0,1,1,1] -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX512DQ-NEXT: vpandnq %ymm10, %ymm21, %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm22 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,1,1] +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpandnq %ymm5, %ymm21, %ymm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm6 +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm4, %xmm15 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm9, %xmm9 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,2,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0],xmm7[1],xmm3[2],xmm7[3],xmm3[4,5],xmm7[6],xmm3[7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm1, %ymm7 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] -; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm7 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[0,1,2,3],zmm3[2,3,2,3] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vprolq $16, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm14[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw 
{{.*#+}} ymm12 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-NEXT: vpshufb %xmm7, %xmm4, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm3[0],xmm5[1],xmm3[2],xmm5[3],xmm3[4,5],xmm5[6],xmm3[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm11, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm30[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10,11],ymm5[12],ymm7[13],ymm5[14],ymm7[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm5[0,1,2,3],zmm3[2,3,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[0,1,2,1,4,5,6,5] +; AVX512DQ-NEXT: vprolq $16, %ymm2, %ymm5 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1],ymm3[2],ymm5[3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8,9],ymm3[10],ymm5[11],ymm3[12],ymm5[13,14],ymm3[15] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm13 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm2[1],ymm13[2,3],ymm2[4],ymm13[5],ymm2[6],ymm13[7,8],ymm2[9],ymm13[10,11],ymm2[12],ymm13[13],ymm2[14],ymm13[15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[3,2,3,3,7,6,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm15, %ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3],ymm2[4],ymm14[5,6],ymm2[7],ymm14[8,9],ymm2[10],ymm14[11],ymm2[12],ymm14[13,14],ymm2[15] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm15, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm17[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm30[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5],ymm6[6],ymm2[7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13],ymm6[14],ymm2[15] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[3,0,3,0,7,4,7,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2],ymm6[3],ymm14[4,5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10],ymm6[11],ymm14[12,13],ymm6[14],ymm14[15] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm6[0,1,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm31[0,1,2,1,4,5,6,5] -; AVX512DQ-NEXT: vprolq $16, %ymm8, 
%ymm14 -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1],ymm6[2],ymm14[3],ymm6[4],ymm14[5,6],ymm6[7],ymm14[8,9],ymm6[10],ymm14[11],ymm6[12],ymm14[13,14],ymm6[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512DQ-NEXT: vpermq $4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,0] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,2,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,2,6,7,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7,8],ymm1[9],ymm8[10],ymm1[11],ymm8[12,13],ymm1[14],ymm8[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm1[2,3,2,2] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,2,5,5,5,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[3,2,3,3,7,6,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm1[1],ymm4[2],ymm1[3,4],ymm4[5,6,7,8],ymm1[9],ymm4[10],ymm1[11,12],ymm4[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm29 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm29 = mem ^ (zmm28 & (zmm29 ^ mem)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm23 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm23 = mem ^ (zmm30 & (zmm23 ^ mem)) -; AVX512DQ-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512DQ-NEXT: vpbroadcastq 96(%r8), %ymm19 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm1, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm23)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm24 ^ (zmm30 & (zmm25 ^ zmm24)) -; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm17 -; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm23 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm17, %zmm17 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,2] -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm19 & (zmm17 ^ zmm25)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm27 ^ (zmm28 & (zmm16 ^ zmm27)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,1,1] -; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm23, %ymm25 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm5 +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm8[1],ymm5[2,3],ymm8[4],ymm5[5],ymm8[6],ymm5[7,8],ymm8[9],ymm5[10,11],ymm8[12],ymm5[13],ymm8[14],ymm5[15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm8 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpandq %zmm5, %zmm20, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 & (zmm11 | zmm8) +; AVX512DQ-NEXT: vpbroadcastq 88(%r8), %ymm8 +; AVX512DQ-NEXT: vpbroadcastq 96(%r8), %ymm16 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm8, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm20, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & ~zmm16) | zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm22 ^ (zmm5 & (zmm31 ^ zmm22)) +; AVX512DQ-NEXT: vpbroadcastq 24(%r8), %ymm11 +; AVX512DQ-NEXT: vpbroadcastq 32(%r8), %ymm22 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm11, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm16 & (zmm22 ^ zmm31)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm18 ^ (zmm16 & (zmm28 ^ zmm18)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm16 & (zmm1 ^ zmm9)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm31 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm21, %zmm16 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = (zmm16 & zmm1) | zmm6 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm8[0,1,1,1] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm8 -; AVX512DQ-NEXT: vpandnq 80(%r8){1to4}, %ymm23, %ymm27 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm8, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 | (zmm29 & zmm1) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm16 & zmm1) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm11, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11 ^ (zmm14 & (zmm11 ^ mem)) -; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm16 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm16, %zmm16 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm24 & (zmm16 ^ zmm11)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq $184, (%rsp), %zmm14, %zmm9 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ mem)) -; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm24 & (zmm10 ^ zmm9)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm9 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm9 ^ (zmm14 & (zmm5 ^ zmm9)) -; AVX512DQ-NEXT: vpbroadcastq 112(%r8), %ymm9 -; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm11 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm23 & (zmm9 ^ zmm5)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm6, %zmm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm14 & (zmm3 ^ zmm5)) -; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm4 -; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm5 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm23 & (zmm4 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm21 ^ (zmm30 & (zmm18 ^ zmm21)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm20 ^ (zmm30 & (zmm7 ^ zmm20)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm18 & zmm3) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & zmm3) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 576(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 192(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 320(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 512(%r9) -; AVX512DQ-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[3,0,3,0,7,4,7,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7,8],ymm0[9],ymm6[10],ymm0[11],ymm6[12,13],ymm0[14],ymm6[15] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[0,1,2,3],zmm3[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[2,3,2,3,6,7,6,7] +; 
AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,2,6,7,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm10, %ymm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm19[1,1,1,2,5,5,5,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7,8],ymm9[9],ymm2[10,11],ymm9[12],ymm2[13],ymm9[14],ymm2[15] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm19[3,2,3,3,7,6,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3,4],ymm10[5,6,7,8],ymm11[9],ymm10[10],ymm11[11,12],ymm10[13,14,15] +; AVX512DQ-NEXT: vpandnq 16(%r8){1to4}, %ymm9, %ymm11 +; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm14 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm11 +; AVX512DQ-NEXT: vpandnq 80(%r8){1to4}, %ymm9, %ymm26 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm11, %zmm11 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm28 & zmm31) +; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm17 +; AVX512DQ-NEXT: vpermq $238, (%rsp), %ymm26 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm26 = mem[2,3,2,3] +; AVX512DQ-NEXT: vpermq $174, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm28 = mem[2,3,2,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,3,2,3] +; AVX512DQ-NEXT: vpermq $186, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm31 = mem[2,2,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm27[0,1,0,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm23[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,0] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm26, %zmm26 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm29, %zmm28 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpandnq %zmm26, %zmm29, %zmm26 +; AVX512DQ-NEXT: vpandq %zmm29, %zmm28, %zmm28 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm9 & (zmm28 | zmm26) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12 +; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm12 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm12 = zmm12 ^ (zmm29 & (zmm12 ^ mem)) +; AVX512DQ-NEXT: vpbroadcastq 64(%r8), %ymm15 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm14 +; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm30 & (zmm14 ^ zmm12)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = zmm4 ^ (zmm29 & (zmm4 ^ mem)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm30, %zmm12 +; AVX512DQ-NEXT: vpbroadcastq (%r8), %ymm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm12 & (zmm1 ^ zmm4)) +; AVX512DQ-NEXT: vpbroadcastq 112(%r8), %ymm4 +; AVX512DQ-NEXT: vpbroadcastq 120(%r8), %ymm12 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm9, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & ~zmm12) | zmm28 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm29 & (zmm2 ^ zmm0)) +; AVX512DQ-NEXT: vpbroadcastq 48(%r8), %ymm0 +; AVX512DQ-NEXT: vpbroadcastq 56(%r8), %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm12 & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm25 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm25 = mem ^ (zmm5 & (zmm25 ^ mem)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm18, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & zmm25) | zmm19 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm24 ^ (zmm5 & (zmm6 ^ zmm24)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm6 & zmm18) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 128(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 320(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 576(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 512(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 448(%r9) +; AVX512DQ-NEXT: addq $360, %rsp # imm = 0x168 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $360, %rsp # imm = 0x168 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdx), %ymm22 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] +; AVX512DQ-FCP-NEXT: subq $536, %rsp # imm = 0x218 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = 
[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm24 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm24[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm31 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm25 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] +; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vprolq $16, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm31[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm23[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25,18,19,0,0,22,23,22,23,0,0,20,21,0,0,24,25] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm26 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm26[1,1,2,2] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm6, %ymm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm2 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm27[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %ymm30 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,0,3,0,7,4,7,4] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm31 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[3,0,3,0,7,4,7,4] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm2[0,1,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,0,3,0,7,4,7,4] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2],xmm9[3],xmm2[4,5],xmm9[6],xmm2[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,0] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm1[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10,11],ymm6[12],ymm9[13],ymm6[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm13 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5],xmm13[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm6[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6 -; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm15, %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm6, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm13, %zmm18 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7,8],ymm13[9],ymm9[10],ymm13[11],ymm9[12,13],ymm13[14],ymm9[15] -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm14, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm24[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,3,2,3,10,11,10,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm14 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0],ymm15[1],ymm13[2],ymm15[3,4],ymm13[5,6,7,8],ymm15[9],ymm13[10],ymm15[11,12],ymm13[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10,11],ymm13[12],ymm7[13],ymm13[14],ymm7[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,3,2,3,10,10,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm13, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm14 ^ (zmm24 & (zmm7 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm14 -; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm22 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm6 & (zmm22 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm25 -; 
AVX512DQ-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8],ymm15[9],ymm0[10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15] -; AVX512DQ-FCP-NEXT: vprolq $16, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3,4],ymm0[5,6,7,8],ymm2[9],ymm0[10],ymm2[11,12],ymm0[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,1,2,5,5,5,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm24 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm5[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3 +; AVX512DQ-FCP-NEXT: vpbroadcastq 104(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,12,13,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm8, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm16 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdx), %ymm17 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4] +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdx), %xmm26 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm26[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm3[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13],ymm12[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm15 +; AVX512DQ-FCP-NEXT: vpbroadcastq 40(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5],xmm14[6],xmm15[7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm12 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm12[0,1,1,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-FCP-NEXT: vpandnq %ymm14, %ymm16, %ymm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29,30,31,28,29,26,27,30,31,30,31,28,29,30,31,28,29] +; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7,8],ymm7[9],ymm14[10],ymm7[11],ymm14[12,13],ymm7[14],ymm14[15] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm11, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm5[2],ymm11[3],ymm5[4],ymm11[5,6],ymm5[7],ymm11[8,9],ymm5[10],ymm11[11],ymm5[12],ymm11[13,14],ymm5[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [2,3,2,3,10,11,10,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm11 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm9[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm7[0],ymm14[1],ymm7[2],ymm14[3,4],ymm7[5,6,7,8],ymm14[9],ymm7[10],ymm14[11,12],ymm7[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,3,2,3,10,10,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm9 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2],ymm15[3,4],ymm14[5,6,7,8],ymm15[9],ymm14[10],ymm15[11,12],ymm14[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,3,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm27[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7,8],ymm9[9],ymm6[10],ymm9[11],ymm6[12,13],ymm9[14],ymm6[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpandnq 16(%r8){1to4}, %ymm20, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vpandnq 80(%r8){1to4}, %ymm20, %ymm25 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1],xmm2[2],xmm15[3],xmm2[4,5],xmm15[6],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,1,0,1,8,9,8,8] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm0 ^ (zmm27 & (zmm30 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 
%ymm16, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vprolq $16, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,1,4,5,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11],ymm2[12],ymm1[13,14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6,7,8],ymm1[9],ymm0[10],ymm1[11,12],ymm0[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[1,1,1,2,5,5,5,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd $251, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm5 = mem[3,2,3,3,7,6,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm5[1],ymm3[2],ymm5[3,4],ymm3[5,6,7,8],ymm5[9],ymm3[10],ymm5[11,12],ymm3[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm16 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm29, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm27 & (zmm1 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,3,2,3,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,3,2,2,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm5 ; AVX512DQ-FCP-NEXT: vpbroadcastq 48(%r8), %ymm0 -; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm30[3,2,3,3,7,6,7,7] -; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2],ymm3[3,4],ymm2[5,6,7,8],ymm3[9],ymm2[10],ymm3[11,12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4,5],ymm5[6],ymm2[7,8],ymm5[9],ymm2[10],ymm5[11],ymm2[12,13],ymm5[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [2,3,2,2,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm4 ^ (zmm2 & (zmm13 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,2,3,3,7,6,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm6[1],ymm4[2],ymm6[3,4],ymm4[5,6,7,8],ymm6[9],ymm4[10],ymm6[11,12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vpbroadcastq 88(%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm8 & (zmm1 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq 72(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,0,1],zmm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2],xmm13[3],xmm10[4,5],xmm13[6],xmm10[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd 
{{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,1,0,1,8,9,8,8] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm11, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm9 ^ (zmm24 & (zmm4 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm10 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm31[2,3,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm6 ^ (zmm2 & (zmm10 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm8 & (zmm6 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpbroadcastq 56(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm20, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm3 & (zmm0 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm8 ^ (zmm1 & (zmm5 ^ zmm8)) ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm25[0,1,1,1] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm12 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm10[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,2,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2],xmm13[3],xmm12[4,5],xmm13[6],xmm12[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm24 & (zmm5 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,1] -; AVX512DQ-FCP-NEXT: 
vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm10 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm10 = mem ^ (zmm8 & (zmm10 ^ mem)) -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm20 = mem ^ (zmm8 & (zmm20 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm10 & zmm7) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm20 & zmm7) -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm28 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm28 = mem ^ (zmm2 & (zmm28 ^ mem)) -; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm19 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm19 = mem ^ (zmm2 & (zmm19 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 | (zmm28 & zmm2) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 | (zmm19 & zmm2) -; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm8 & (zmm2 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%r9) +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm10[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,2,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1],xmm12[2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpandnq %zmm11, %zmm27, %zmm11 +; AVX512DQ-FCP-NEXT: vpandq %zmm27, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpandnq %zmm14, %zmm1, %zmm13 +; AVX512DQ-FCP-NEXT: vpandq %zmm1, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpbroadcastq 24(%r8), %ymm14 +; AVX512DQ-FCP-NEXT: vpbroadcastq 32(%r8), %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm27 & (zmm8 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpermq $84, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm22 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm22 = mem ^ (zmm10 & (zmm22 ^ mem)) +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm24 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm24 = mem ^ (zmm10 & (zmm24 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm18, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm16 = (zmm16 & zmm24) | mem +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm20 & (zmm7 | zmm11) +; AVX512DQ-FCP-NEXT: vpbroadcastq 112(%r8), %ymm11 +; AVX512DQ-FCP-NEXT: vpbroadcastq 120(%r8), %ymm17 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & ~zmm3) | zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 & (zmm6 | zmm13) +; AVX512DQ-FCP-NEXT: vpbroadcastq 88(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastq 96(%r8), %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & ~zmm15) | zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = mem ^ (zmm1 & (zmm6 ^ mem)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm10, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = (zmm3 & zmm6) | mem +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = mem ^ (zmm1 & (zmm6 ^ mem)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm6 & zmm10) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 | (zmm22 & zmm18) +; AVX512DQ-FCP-NEXT: vpbroadcastq 64(%r8), %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm12 & 
(zmm1 ^ zmm30)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastq (%r8), %ymm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm6 & (zmm5 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 384(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 576(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 512(%r9) -; AVX512DQ-FCP-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 384(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 576(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 512(%r9) +; AVX512DQ-FCP-NEXT: addq $536, %rsp # imm = 0x218 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index 5e26564465c25..673cc4f875e02 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -3911,872 +3911,836 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride6_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $40, %rsp -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: subq $184, %rsp +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm1 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-NEXT: vpermt2d %zmm3, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm3[1,1,1,1] +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: movw 
$18724, %ax # imm = 0x4924 +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpshufb %xmm7, %xmm5, %xmm5 +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm25 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm16, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 32(%r9), %xmm21 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[2,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,1,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm0[2,2,2,2] +; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} ymm5 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd 
{{.*#+}} ymm0 = ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[12],ymm15[12],ymm7[13],ymm15[13],ymm7[14],ymm15[14],ymm7[15],ymm15[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm0[3,3,3,3] -; AVX512-NEXT: vmovdqa (%rcx), %ymm0 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm3 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[2],ymm1[2],ymm8[3],ymm1[3],ymm8[8],ymm1[8],ymm8[9],ymm1[9],ymm8[10],ymm1[10],ymm8[11],ymm1[11] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16 -; AVX512-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512-NEXT: vpermt2d %zmm1, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqa (%rsi), %xmm6 -; AVX512-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm17 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512-NEXT: vmovdqa (%r8), %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm8[2,2,2,2] -; AVX512-NEXT: vmovdqa (%r9), %xmm10 -; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm8[0,1,0,1] -; AVX512-NEXT: vmovdqa (%r9), %ymm8 
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm1 -; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm22, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm22 & (zmm20 ^ zmm0)) -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm2 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-NEXT: vpermt2d %zmm1, %zmm21, %zmm0 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm1[1,1,1,1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[8],ymm15[8],ymm7[9],ymm15[9],ymm7[10],ymm15[10],ymm7[11],ymm15[11] -; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm1[2,2,2,3] -; AVX512-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX512-NEXT: vpshufb %xmm13, %xmm7, %xmm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm1[0,1,0,1] -; AVX512-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm1[2,2,2,2] -; AVX512-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm1[0,1,0,1] -; AVX512-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm21 -; AVX512-NEXT: vmovdqa32 %zmm21, %zmm0 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm21 -; AVX512-NEXT: vmovdqa32 %zmm21, %zmm0 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm26, %zmm27 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm22 & (zmm27 ^ zmm0)) -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} 
ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[2],ymm0[2],ymm7[3],ymm0[3],ymm7[8],ymm0[8],ymm7[9],ymm0[9],ymm7[10],ymm0[10],ymm7[11],ymm0[11] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512-NEXT: vmovdqa (%rdi), %xmm6 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm2[1,1,1,1] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm1[2,2,2,3] +; AVX512-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm1[0,1,0,1] +; AVX512-NEXT: vmovdqa (%r8), %ymm1 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm2[2,2,2,2] +; AVX512-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,1,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm2[0,1,0,1] +; AVX512-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm17, %zmm17 +; AVX512-NEXT: vmovdqa32 %zmm17, %zmm0 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm17 +; AVX512-NEXT: vmovdqa32 %zmm17, %zmm0 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm25 +; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm24 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm24 & (zmm25 ^ zmm0)) +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512-NEXT: 
vpsrldq {{.*#+}} xmm4 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} xmm10 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-NEXT: vpermt2d %zmm4, %zmm30, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm29[3,3,3,3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm10[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm4[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512-NEXT: vpshufb %ymm9, %ymm14, %ymm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm10[2,1,2,3] +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k2} +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512-NEXT: vpermt2d %zmm2, %zmm26, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm3[2,2,2,3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512-NEXT: vpshufb %ymm4, %ymm14, %ymm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm5[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm2[2,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm1[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = 
ymm29[2,2,2,2] -; AVX512-NEXT: vpshufd {{.*#+}} ymm24 = ymm31[1,2,3,3,5,6,7,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm18[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm16[3,3,3,3] -; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm1 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,1,2,3] -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm28 {%k2} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrldq {{.*#+}} xmm6 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512-NEXT: vpermt2d %zmm1, %zmm26, %zmm0 -; AVX512-NEXT: vmovdqa64 %xmm17, %xmm12 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm17[0,1,2,1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm0[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm28[2,2,2,2] +; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[1,2,3,3,5,6,7,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm20[2,1,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm18[3,3,3,3] +; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm9 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm9[2,1,2,3] +; AVX512-NEXT: vpermt2d %zmm11, %zmm30, %zmm10 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; 
AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[0,1,2,3],zmm1[0,1,0,1] +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpbroadcastq %xmm6, %ymm6 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm10, %zmm10 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm21 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm5[0,0,2,1,4,5,6,7] ; AVX512-NEXT: vpbroadcastq %xmm12, %ymm12 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm15[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm9[0,1,2,3],zmm1[0,1,0,1] -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm9 -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm30[0],zero,xmm30[1],zero,xmm30[2],zero,xmm30[3],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm15, %ymm15 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512-NEXT: # ymm17 = mem[2,2,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm16, %zmm16 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 -; AVX512-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] +; AVX512-NEXT: vpternlogq $186, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = (zmm10 & ~zmm24) | mem +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload +; AVX512-NEXT: # ymm20 = mem[2,2,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm19, %zmm19 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vinserti64x4 
$1, %ymm22, %zmm21, %zmm16 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm18, %zmm20 +; AVX512-NEXT: vmovdqa32 %zmm19, %zmm20 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm29, %zmm19 ; AVX512-NEXT: movw $-28087, %cx # imm = 0x9249 ; AVX512-NEXT: kmovw %ecx, %k2 -; AVX512-NEXT: vmovdqa32 %zmm16, %zmm17 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm23, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm18 & (zmm16 ^ zmm17)) -; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm24[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm4, %zmm3 -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ zmm2)) -; AVX512-NEXT: vpbroadcastq %xmm11, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm28 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm28)) -; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm4 & (zmm0 ^ zmm1)) -; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm27, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: vmovdqa32 %zmm19, %zmm20 {%k2} +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm16 & (zmm15 ^ zmm20)) +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm14 +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm13, %zmm13 +; AVX512-NEXT: vmovdqa32 %zmm14, %zmm13 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm13 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm9, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm13)) +; AVX512-NEXT: vpbroadcastq %xmm8, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = 
zmm0 ^ (zmm22 & (zmm0 ^ zmm4)) +; AVX512-NEXT: vpbroadcastq %xmm21, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 +; AVX512-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm12, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm22, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm4 & (zmm1 ^ zmm3)) +; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rax) +; AVX512-NEXT: addq $184, %rsp ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride6_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $72, %rsp -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm7 ; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [8,21,10,11,20,13,14,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm30 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm10 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm31 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,0,3,10,0,10,11] -; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm10, %zmm19 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm12 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm28 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm13 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm12 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 -; 
AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm13 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm0, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1 -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm29 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm4 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,8,8,0,9] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm15 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,8,0,9,0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [0,9,2,3,8,5,6,11] +; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm28, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm22 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm7 +; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,0,1,10,10,10,10] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm23, %zmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm11 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [2,1,2,3,11,11,11,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm25, %ymm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [8,21,10,11,20,13,14,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm27 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,10,9,10,11] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = 
[0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm20 +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm7 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm23, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm11[4],ymm4[5],ymm11[5],ymm4[6],ymm11[6],ymm4[7],ymm11[7],ymm4[12],ymm11[12],ymm4[13],ymm11[13],ymm4[14],ymm11[14],ymm4[15],ymm11[15] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm7 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm0, %ymm25 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm23 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm6 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm8 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm30 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm9 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [8,8,0,9,0,1,0,1] +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm15 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = 
[16,9,10,17,12,13,18,15] -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm12 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm21, %zmm12 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] -; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm22, %ymm24 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm21 -; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm6 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm3, %ymm22 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm22, %zmm22 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11] -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [8,9,20,11,12,21,14,15] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512-FCP-NEXT: 
vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,1,0,10,10,0] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm14 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm7 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[2],mem[2],ymm8[3],mem[3],ymm8[8],mem[8],ymm8[9],mem[9],ymm8[10],mem[10],ymm8[11],mem[11] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm8 -; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm8, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm15 {%k2} +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm18[0],zero,xmm18[1],zero,xmm18[2],zero,xmm18[3],zero +; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm15 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = 
xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpermt2d %ymm8, %ymm14, %ymm24 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm24, %zmm8 +; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm13 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX512-FCP-NEXT: vpermi2d %ymm15, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[1],ymm11[1],ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[8],ymm11[8],ymm4[9],ymm11[9],ymm4[10],ymm11[10],ymm4[11],ymm11[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 +; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1} +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm2, %ymm28 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm6 & (zmm2 ^ zmm1)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6)) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq $184, (%rsp), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm0 & (zmm19 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm0 & (zmm20 ^ zmm29)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm0 & (zmm23 ^ zmm21)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm0 & (zmm18 ^ zmm22)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512-FCP-NEXT: addq $72, %rsp +; AVX512-FCP-NEXT: vpandq %zmm4, %zmm22, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm3 & ~zmm6) | zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm16 ^ (zmm2 & (zmm23 ^ zmm16)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm27)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm8)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm30 ^ (zmm1 & (zmm0 ^ zmm30)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride6_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $152, %rsp -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm10 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm30 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-NEXT: pushq %rax +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm14 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd 
{{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm23 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm3[0,1,2,3],zmm1[0,1,0,1] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm1[0,1,0,1] ; AVX512DQ-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm27 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,3,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm29 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm0[0,0,2,1] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm9 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm16 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,2,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,1] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm17 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,0,2,1] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm2[0,1,2,3],zmm1[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm28 {%k1} -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm2[0,1,2,3],zmm1[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa32 %zmm0, %zmm30 {%k1} +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm21 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm19 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm12 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm12[2,1,2,3,6,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm14 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,1,2,3,6,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm0[2,1,2,3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm0[3,3,3,3] -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm0[2,2,2,2] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm19 = ymm0[1,2,3,3,5,6,7,7] -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,0,2,1] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = 
ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm6 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm11 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[8],ymm6[8],ymm11[9],ymm6[9],ymm11[10],ymm6[10],ymm11[11],ymm6[11] -; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm16 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm11[2,2,2,3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm31 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm31, %zmm4 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm2[2,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm3 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm3[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm8[1,1,1,1] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm31, %zmm5 -; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] 
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm15, %xmm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm14 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm0[2,1,2,3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm3[4],ymm10[5],ymm3[5],ymm10[6],ymm3[6],ymm10[7],ymm3[7],ymm10[12],ymm3[12],ymm10[13],ymm3[13],ymm10[14],ymm3[14],ymm10[15],ymm3[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm0[3,3,3,3] +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm2 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm14 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm14[0],ymm2[0],ymm14[1],ymm2[1],ymm14[2],ymm2[2],ymm14[3],ymm2[3],ymm14[8],ymm2[8],ymm14[9],ymm2[9],ymm14[10],ymm2[10],ymm14[11],ymm2[11] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm2[2,2,2,2] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm22 = ymm2[1,2,3,3,5,6,7,7] +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm14[2,2,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm2[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm14[2,1,2,3] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm15[2,2,2,3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm4[2,1,2,3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm1, %zmm12 +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512DQ-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm3, %zmm4 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm15, %zmm13, %zmm4 {%k1} -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm2[1,1,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[2,2,2,3] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm2 
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm17[2,1,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm16[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[1,2,3,3,5,6,7,7] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm14[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm6, %zmm2, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm18[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm29 {%k2} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm10[0],ymm3[0],ymm10[1],ymm3[1],ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[8],ymm3[8],ymm10[9],ymm3[9],ymm10[10],ymm3[10],ymm10[11],ymm3[11] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm5[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm3, %zmm0, %zmm12 {%k2} +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[2],ymm10[2],ymm0[3],ymm10[3],ymm0[8],ymm10[8],ymm0[9],ymm10[9],ymm0[10],ymm10[10],ymm0[11],ymm10[11] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm11, %xmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm12 {%k1} +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm2 = 
ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm5[4],ymm13[5],ymm5[5],ymm13[6],ymm5[6],ymm13[7],ymm5[7],ymm13[12],ymm5[12],ymm13[13],ymm5[13],ymm13[14],ymm5[14],ymm13[15],ymm5[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,2,3,3,5,6,7,7] +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,1,1,1] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm13[0],ymm5[0],ymm13[1],ymm5[1],ymm13[2],ymm5[2],ymm13[3],ymm5[3],ymm13[8],ymm5[8],ymm13[9],ymm5[9],ymm13[10],ymm5[10],ymm13[11],ymm5[11] +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm13 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm5, %zmm8, %zmm3 {%k2} +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero +; AVX512DQ-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm14 & (zmm6 ^ zmm4)) -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX512DQ-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = mem[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = 
xmm15[0,2,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm15, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm14 & (zmm7 ^ zmm5)) +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm11, %zmm6, %zmm3 {%k1} +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm6, %zmm12, %zmm11 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm19[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm6, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm14 & (zmm13 ^ zmm3)) +; AVX512DQ-NEXT: vpbroadcastq %xmm8, %ymm3 +; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm30 {%k2} # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm26 & (zmm3 ^ zmm29)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm21, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm26, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm8 ^ (zmm21 & (zmm30 ^ zmm8)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm22[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm23, %zmm8 {%k2} +; AVX512DQ-NEXT: movw $-28087, %ax # imm = 0x9249 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm20, %zmm18, %zmm8 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm7[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm15, %zmm0 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm25, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm19 & (zmm1 ^ zmm8)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm19, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ 
(zmm2 & (zmm4 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & ~zmm14) | zmm11 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 {%k2} # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm27)) -; AVX512DQ-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm28 {%k2} # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm5 & (zmm10 ^ zmm28)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm21, %zmm22, %zmm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm19[2,2,2,3] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm14, %zmm20, %zmm5 {%k2} -; AVX512DQ-NEXT: movw $-28087, %cx # imm = 0x9249 -; AVX512DQ-NEXT: kmovw %ecx, %k1 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm29, %zmm24, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm30, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm14 & (zmm3 ^ zmm5)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm5 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm11[2,2,2,3] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm12, %zmm5 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm13, %zmm5 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm14 & (zmm1 ^ zmm5)) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-NEXT: addq $152, %rsp +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax) +; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $72, %rsp -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = 
ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm7 ; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm8, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 
%ymm7, %ymm30 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm31 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,0,3,10,0,10,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm10, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm10 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm12 = ymm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm7[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm28 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm13 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm4, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,0,8,8,0,9] -; 
AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,0,8,8,0,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [8,8,0,9,0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm28, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,1,0,1,10,10,10,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm6 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = 
[4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm23, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm11 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm8[0],ymm11[1],ymm8[1],ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[8],ymm8[8],ymm11[9],ymm8[9],ymm11[10],ymm8[10],ymm11[11],ymm8[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,10,9,10,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm4 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm7, %zmm20 +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm7 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm23, %zmm7 +; 
AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm4[4],ymm11[4],ymm4[5],ymm11[5],ymm4[6],ymm11[6],ymm4[7],ymm11[7],ymm4[12],ymm11[12],ymm4[13],ymm11[13],ymm4[14],ymm11[14],ymm4[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm7 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm0, %ymm25 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm23 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm6 = xmm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm9 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [8,8,0,9,0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm9 ; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm25, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm12 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm21, %zmm12 -; 
AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm22, %ymm24 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm3, %ymm22 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm22, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm12, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [0,0,0,1,0,10,10,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm14 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11] -; 
AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm8 = ymm8[0],mem[0],ymm8[1],mem[1],ymm8[2],mem[2],ymm8[3],mem[3],ymm8[8],mem[8],ymm8[9],mem[9],ymm8[10],mem[10],ymm8[11],mem[11] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm4, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm0 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm15 {%k2} +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm24, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm8 = xmm18[0],zero,xmm18[1],zero,xmm18[2],zero,xmm18[3],zero +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm26, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm8, %ymm14, %ymm24 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm24, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm13, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm13 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm15 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero +; AVX512DQ-FCP-NEXT: vpermi2d %ymm15, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} 
xmm15 = xmm13[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm11[0],ymm4[1],ymm11[1],ymm4[2],ymm11[2],ymm4[3],ymm11[3],ymm4[8],ymm11[8],ymm4[9],ymm11[9],ymm4[10],ymm11[10],ymm4[11],ymm11[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm11, %zmm10 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm31, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm2, %ymm28 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm6 & (zmm2 ^ zmm1)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6)) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq $184, (%rsp), %zmm0, %zmm19 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm19 = zmm19 ^ (zmm0 & (zmm19 ^ mem)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm0 & (zmm20 ^ zmm29)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm0 & (zmm23 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm0 & (zmm18 ^ zmm22)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-FCP-NEXT: addq $72, %rsp +; AVX512DQ-FCP-NEXT: vpandq %zmm4, %zmm22, %zmm1 +; 
AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm3 & ~zmm6) | zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm16 ^ (zmm2 & (zmm23 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ zmm27)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm5 & (zmm20 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm30 ^ (zmm1 & (zmm0 ^ zmm30)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 320(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -8262,957 +8226,958 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride6_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $744, %rsp # imm = 0x2E8 -; AVX512-NEXT: vmovdqa 96(%rcx), %ymm9 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512-NEXT: subq $1032, %rsp # imm = 0x408 +; AVX512-NEXT: vmovdqa 96(%rcx), %xmm0 +; AVX512-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-NEXT: vmovdqa 96(%rcx), %ymm1 +; AVX512-NEXT: vmovdqa 96(%rdx), %ymm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm7 +; AVX512-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm5, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,1,1,1] +; AVX512-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512-NEXT: vmovdqa 96(%rdi), %ymm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 +; AVX512-NEXT: movw $18724, %ax # imm = 0x4924 +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} +; AVX512-NEXT: vmovdqa 96(%r8), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; 
AVX512-NEXT: vmovdqa64 %xmm8, %xmm18 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[0,1,0,1] +; AVX512-NEXT: vmovdqa 96(%r8), %ymm6 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512-NEXT: kmovw %eax, %k2 +; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm16, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%r9), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,0,1] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%r9), %ymm0 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} ymm7 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} ymm8 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 96(%rsi), %ymm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vmovdqa 96(%rdi), %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 64(%rcx), %ymm4 ; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm16 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm27 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm30 -; AVX512-NEXT: vmovdqa 64(%rsi), %ymm15 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512-NEXT: vpsrldq {{.*#+}} ymm2 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[12],ymm4[12],ymm9[13],ymm4[13],ymm9[14],ymm4[14],ymm9[15],ymm4[15] +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm26 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%rsi), %ymm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512-NEXT: 
vpshufd {{.*#+}} ymm4 = ymm14[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,1,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm14[4],ymm6[4],ymm14[5],ymm6[5],ymm14[6],ymm6[6],ymm14[7],ymm6[7],ymm14[12],ymm6[12],ymm14[13],ymm6[13],ymm14[14],ymm6[14],ymm14[15],ymm6[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm2[3,3,3,3] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm24 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm5 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 32(%rsi), %ymm29 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm29[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm1 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vmovdqa (%rdx), %ymm12 -; AVX512-NEXT: vpsrldq {{.*#+}} ymm6 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %ymm13 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vmovdqa (%rdi), %ymm14 -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[2,1,2,3,6,5,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[8],ymm1[8],ymm6[9],ymm1[9],ymm6[10],ymm1[10],ymm6[11],ymm1[11] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm12[4],ymm5[4],ymm12[5],ymm5[5],ymm12[6],ymm5[6],ymm12[7],ymm5[7],ymm12[12],ymm5[12],ymm12[13],ymm5[13],ymm12[14],ymm5[14],ymm12[15],ymm5[15] -; 
AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm5[0],ymm12[1],ymm5[1],ymm12[2],ymm5[2],ymm12[3],ymm5[3],ymm12[8],ymm5[8],ymm12[9],ymm5[9],ymm12[10],ymm5[10],ymm12[11],ymm5[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512-NEXT: vmovdqa (%rcx), %xmm5 -; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} ymm4 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512-NEXT: vpsrldq {{.*#+}} ymm7 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm12 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512-NEXT: vpsrldq {{.*#+}} ymm4 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vmovdqa (%rdx), %ymm15 +; AVX512-NEXT: vpsrldq {{.*#+}} ymm7 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm4[0],ymm7[1],ymm4[1],ymm7[2],ymm4[2],ymm7[3],ymm4[3],ymm7[8],ymm4[8],ymm7[9],ymm4[9],ymm7[10],ymm4[10],ymm7[11],ymm4[11] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[2,1,2,3,6,5,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm15[4],ymm11[4],ymm15[5],ymm11[5],ymm15[6],ymm11[6],ymm15[7],ymm11[7],ymm15[12],ymm11[12],ymm15[13],ymm11[13],ymm15[14],ymm11[14],ymm15[15],ymm11[15] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm15[0],ymm11[0],ymm15[1],ymm11[1],ymm15[2],ymm11[2],ymm15[3],ymm11[3],ymm15[8],ymm11[8],ymm15[9],ymm11[9],ymm15[10],ymm11[10],ymm15[11],ymm11[11] +; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa (%rdx), 
%xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX512-NEXT: vpermt2d %zmm1, %zmm20, %zmm12 -; AVX512-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15] -; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512-NEXT: vmovdqa (%r8), %xmm6 -; AVX512-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vmovdqa (%r8), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512-NEXT: vmovdqa (%r9), %xmm10 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 +; AVX512-NEXT: vpermt2d %zmm4, %zmm19, %zmm15 +; AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm7[4],ymm10[5],ymm7[5],ymm10[6],ymm7[6],ymm10[7],ymm7[7],ymm10[12],ymm7[12],ymm10[13],ymm7[13],ymm10[14],ymm7[14],ymm10[15],ymm7[15] +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[8],ymm7[8],ymm10[9],ymm7[9],ymm10[10],ymm7[10],ymm10[11],ymm7[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512-NEXT: vmovdqa (%r8), %xmm10 ; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] +; AVX512-NEXT: vmovdqa (%r8), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} 
ymm13 = ymm13[0,1,0,1] -; AVX512-NEXT: vmovdqa (%r9), %ymm10 -; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 -; AVX512-NEXT: movw $18724, %ax # imm = 0x4924 -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm12 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm1 -; AVX512-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm12 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm12)) -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm9[4],ymm0[5],ymm9[5],ymm0[6],ymm9[6],ymm0[7],ymm9[7],ymm0[12],ymm9[12],ymm0[13],ymm9[13],ymm0[14],ymm9[14],ymm0[15],ymm9[15] -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm31 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm9[0],ymm0[1],ymm9[1],ymm0[2],ymm9[2],ymm0[3],ymm9[3],ymm0[8],ymm9[8],ymm0[9],ymm9[9],ymm0[10],ymm9[10],ymm0[11],ymm9[11] -; AVX512-NEXT: vmovdqa 96(%rcx), %xmm5 -; AVX512-NEXT: vmovdqa 96(%rdx), %xmm12 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] -; AVX512-NEXT: vpermt2d %zmm1, %zmm20, %zmm0 -; AVX512-NEXT: vmovdqa 96(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15] -; AVX512-NEXT: vmovdqa64 %ymm7, %ymm18 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[2,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,1,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm13[0,1,0,1] -; AVX512-NEXT: vmovdqa 96(%r9), %ymm14 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2 -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqa32 %zmm2, %zmm0 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm17, 
%zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm21 & (zmm2 ^ zmm0)) -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3] -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,0,1] -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} +; AVX512-NEXT: vmovdqa (%r9), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512-NEXT: vmovdqa32 %zmm4, %zmm15 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 +; AVX512-NEXT: vmovdqa32 %zmm4, %zmm15 {%k2} +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm23 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm23 & (zmm1 ^ zmm15)) ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rcx), %xmm9 -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm12 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm6 -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = 
[1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-NEXT: vpermt2d %zmm3, %zmm28, %zmm0 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm7 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[0,1,2,3],zmm1[0,1,0,1] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm27, %ymm4 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[12],ymm1[12],ymm4[13],ymm1[13],ymm4[14],ymm1[14],ymm4[15],ymm1[15] -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm16 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[8],ymm1[8],ymm4[9],ymm1[9],ymm4[10],ymm1[10],ymm4[11],ymm1[11] -; AVX512-NEXT: vpermt2d %zmm3, %zmm20, %zmm1 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm22[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm4[0,1,2,3],zmm3[0,1,0,1] +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm30 {%k2} +; AVX512-NEXT: vmovdqa 64(%rcx), %xmm0 +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} xmm7 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-NEXT: vpermt2d %zmm4, %zmm28, %zmm13 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm7 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm15[0,1,2,3],zmm4[0,1,0,1] +; AVX512-NEXT: vmovdqa32 %zmm13, %zmm24 {%k2} +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[8],ymm0[8],ymm9[9],ymm0[9],ymm9[10],ymm0[10],ymm9[11],ymm0[11] +; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm0 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,1,1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm14[0],ymm6[0],ymm14[1],ymm6[1],ymm14[2],ymm6[2],ymm14[3],ymm6[3],ymm14[8],ymm6[8],ymm14[9],ymm6[9],ymm14[10],ymm6[10],ymm14[11],ymm6[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512-NEXT: vmovdqa 64(%r8), %xmm4 +; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm11 +; AVX512-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512-NEXT: vmovdqa64 64(%r9), %xmm29 +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm29[2,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,1,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm27 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm23 & (zmm27 ^ zmm0)) +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm14 = ymm2[4],ymm5[4],ymm2[5],ymm5[5],ymm2[6],ymm5[6],ymm2[7],ymm5[7],ymm2[12],ymm5[12],ymm2[13],ymm5[13],ymm2[14],ymm5[14],ymm2[15],ymm5[15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm2[0],ymm5[0],ymm2[1],ymm5[1],ymm2[2],ymm5[2],ymm2[3],ymm5[3],ymm2[8],ymm5[8],ymm2[9],ymm5[9],ymm2[10],ymm5[10],ymm2[11],ymm5[11] +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm10 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm15[4],ymm8[5],ymm15[5],ymm8[6],ymm15[6],ymm8[7],ymm15[7],ymm8[12],ymm15[12],ymm8[13],ymm15[13],ymm8[14],ymm15[14],ymm8[15],ymm15[15] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm15[0],ymm8[1],ymm15[1],ymm8[2],ymm15[2],ymm8[3],ymm15[3],ymm8[8],ymm15[8],ymm8[9],ymm15[9],ymm8[10],ymm15[10],ymm8[11],ymm15[11] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vmovdqa 64(%r8), %xmm3 -; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm9 -; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512-NEXT: vmovdqa 64(%r8), %ymm10 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512-NEXT: vmovdqa 64(%r9), %xmm5 -; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vmovdqa 64(%r9), %ymm12 -; 
AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm12[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm1)) -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm0[4],ymm11[5],ymm0[5],ymm11[6],ymm0[6],ymm11[7],ymm0[7],ymm11[12],ymm0[12],ymm11[13],ymm0[13],ymm11[14],ymm0[14],ymm11[15],ymm0[15] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[1],ymm0[1],ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[8],ymm0[8],ymm11[9],ymm0[9],ymm11[10],ymm0[10],ymm11[11],ymm0[11] -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm2 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-NEXT: vpermt2d %zmm3, %zmm20, %zmm0 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm5[1,1,1,1] -; AVX512-NEXT: vmovdqa64 %ymm29, %ymm5 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] -; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] -; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3] -; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 -; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm5[0,1,0,1] -; AVX512-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm8[2,2,2,2] -; AVX512-NEXT: vmovdqa 32(%r9), %xmm5 -; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[2,3,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm11[0,1,0,1] -; AVX512-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm20, %zmm20 -; AVX512-NEXT: vmovdqa32 %zmm20, %zmm0 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm20 -; AVX512-NEXT: vmovdqa32 %zmm20, %zmm0 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm27, %zmm20 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm21 & (zmm20 ^ zmm0)) -; AVX512-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload -; AVX512-NEXT: # ymm21 = mem[2,2,2,2] -; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[1,2,3,3,5,6,7,7] -; AVX512-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload -; AVX512-NEXT: # ymm24 = mem[2,1,2,3] -; 
AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm18[3,3,3,3] -; AVX512-NEXT: vmovdqa32 %zmm6, %zmm19 {%k2} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm5 -; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vmovdqa64 %ymm28, %ymm6 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm2[2,2,2,3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm4[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm2[0,1,2,3],zmm1[0,1,0,1] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm8[4],ymm12[4],ymm8[5],ymm12[5],ymm8[6],ymm12[6],ymm8[7],ymm12[7],ymm8[12],ymm12[12],ymm8[13],ymm12[13],ymm8[14],ymm12[14],ymm8[15],ymm12[15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[8],ymm12[8],ymm8[9],ymm12[9],ymm8[10],ymm12[10],ymm8[11],ymm12[11] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,2,2,3] +; AVX512-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,0,1] +; AVX512-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm3[2,2,2,2] +; AVX512-NEXT: vmovdqa64 32(%r9), %xmm26 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm26[2,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm3[0,1,0,1] +; AVX512-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm10 {%k2} +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm19, %zmm25 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm23 & (zmm25 ^ zmm10)) +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpsrldq {{.*#+}} xmm7 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = 
xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-NEXT: vpermt2d %zmm6, %zmm28, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm6[2,2,2,3] +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm7 +; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm6[2,1,2,3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm1[2,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm1[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,2,3,6,7,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,2,2,2] -; AVX512-NEXT: vpshufd {{.*#+}} ymm31 = ymm16[1,2,3,3,5,6,7,7] -; AVX512-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512-NEXT: # ymm17 = mem[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm22[3,3,3,3] -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm18 {%k2} +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,1,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,2] +; AVX512-NEXT: vpshufd {{.*#+}} ymm18 = ymm14[1,2,3,3,5,6,7,7] +; AVX512-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512-NEXT: # ymm19 = mem[2,1,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm4[3,3,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,0,2,1] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm4[0,1,2,3],zmm8[0,1,0,1] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm4[2,2,2,3] +; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX512-NEXT: vmovdqa %ymm7, %ymm9 +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,1,2,3] +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm8 {%k2} +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpsrldq {{.*#+}} xmm6 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm6[2,2,2,3] -; AVX512-NEXT: vpshufb %ymm8, %ymm10, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm6[2,1,2,3] -; AVX512-NEXT: vpermt2d %zmm0, %zmm5, %zmm4 -; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 
16-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,2,1] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,1,2,3] +; AVX512-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512-NEXT: # ymm5 = mem[2,2,2,2] +; AVX512-NEXT: vpshufd $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[1,2,3,3,5,6,7,7] +; AVX512-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512-NEXT: # ymm4 = mem[2,1,2,3] +; AVX512-NEXT: vpermq $255, (%rsp), %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[3,3,3,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq $186, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: # zmm17 = (zmm17 & ~zmm23) | mem +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[2,2,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1} +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512-NEXT: movw $-28087, %ax # imm = 0x9249 +; AVX512-NEXT: kmovw %eax, %k3 +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm11 {%k3} +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload +; AVX512-NEXT: # ymm2 = mem[2,2,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm23, %zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm23 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm23 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm21, %zmm22, %zmm2 +; AVX512-NEXT: vmovdqa32 %zmm2, %zmm23 {%k3} +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm16, %zmm16 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 
$1, %ymm2, %zmm21, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm23)) +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm18[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm20, %zmm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512-NEXT: vmovdqa32 %zmm13, %zmm15 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10 +; AVX512-NEXT: vmovdqa32 %zmm10, %zmm15 {%k3} +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm15)) +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm0[2,1,2,3] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm0 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512-NEXT: vpermt2d %zmm7, %zmm28, %zmm14 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,1,2,1] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,5] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,1,2,1] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm6[2,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm6[2,1,2,3] -; AVX512-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX512-NEXT: # ymm7 = mem[2,2,2,2] -; AVX512-NEXT: vpshufd $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[1,2,3,3,5,6,7,7] -; AVX512-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = mem[2,1,2,3] -; AVX512-NEXT: vpermq $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-NEXT: # ymm5 = mem[3,3,3,3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm9 -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm23 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm0[0,1,2,3],zmm2[0,1,0,1] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2} -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm25[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm21, %zmm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm24, %zmm24 -; AVX512-NEXT: vmovdqa32 %zmm4, %zmm24 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm27, %zmm4 -; AVX512-NEXT: movw $-28087, %ax # imm = 0x9249 -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovdqa32 %zmm4, %zmm24 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm29, %zmm21 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm26 & (zmm21 ^ zmm24)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm31[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm30, %zmm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm14 -; AVX512-NEXT: vmovdqa32 %zmm4, %zmm14 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm4 -; AVX512-NEXT: vmovdqa32 %zmm4, %zmm14 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm22, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm26 & (zmm4 ^ zmm14)) -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX512-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = mem[2,2,2,2] -; AVX512-NEXT: vpshufd $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = mem[1,2,3,3,5,6,7,7] -; AVX512-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = mem[2,1,2,3] -; AVX512-NEXT: vpermq $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload -; AVX512-NEXT: # ymm11 = mem[3,3,3,3] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-NEXT: vmovdqa64 %ymm23, %ymm7 -; AVX512-NEXT: vpshufb %ymm7, %ymm13, %ymm14 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1} -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm3[2,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,3,2,3,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm3[2,1,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm3 -; AVX512-NEXT: vmovdqa32 %zmm3, %zmm2 {%k2} +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,3,2,3,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; 
AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm27 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm3[0,0,2,1] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm3, %ymm24 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm19 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm12[0,0,2,1] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm3[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpbroadcastq %xmm15, %ymm22 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm3[0,0,2,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm26 & (zmm3 ^ zmm2)) -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm29 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm2[0,0,2,1] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm5, %ymm5 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm3[0,0,2,1] +; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k3} +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm23 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,1,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm9, %ymm9 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm13[0,1,2,3],zmm5[0,1,0,1] +; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpbroadcastq %xmm13, %ymm13 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm31[0],zero,xmm31[1],zero,xmm31[2],zero,xmm31[3],zero +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[2,1,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm15, %zmm6 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[2,1,3,3,4,5,6,7] +; AVX512-NEXT: vmovdqa32 %zmm14, %zmm5 {%k2} +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpbroadcastq %xmm14, %ymm14 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm2 & (zmm6 ^ zmm1)) +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX512-NEXT: vmovdqa32 %zmm6, %zmm8 {%k2} -; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm16, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm26 & (zmm6 ^ zmm8)) -; AVX512-NEXT: vpbroadcastq %xmm27, %ymm8 -; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm8, %zmm8 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm24, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm16 & (zmm8 ^ zmm7)) -; AVX512-NEXT: vpbroadcastq %xmm29, %ymm17 -; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm17, %zmm17 -; AVX512-NEXT: vmovdqa32 %zmm17, %zmm19 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm16 & (zmm5 ^ zmm19)) -; AVX512-NEXT: vpbroadcastq %xmm10, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512-NEXT: vmovdqa32 %zmm10, %zmm18 {%k1} -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm16 & (zmm9 ^ zmm18)) -; AVX512-NEXT: vpbroadcastq %xmm14, %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX512-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; 
AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm31, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm21 & (zmm18 ^ zmm11)) +; AVX512-NEXT: vpbroadcastq %xmm19, %ymm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm11, %zmm11 +; AVX512-NEXT: vmovdqa32 %zmm11, %zmm30 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm22, %zmm11 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm20 & (zmm11 ^ zmm30)) +; AVX512-NEXT: vpbroadcastq %xmm23, %ymm19 +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm9 +; AVX512-NEXT: vmovdqa32 %zmm9, %zmm24 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm13, %zmm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm21, %zmm20, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm24)) +; AVX512-NEXT: vpbroadcastq %xmm7, %ymm7 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8)) +; AVX512-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm0)) +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm5 {%k1} +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm5)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, 576(%rax) -; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm27, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 576(%rax) ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vmovaps %zmm0, 64(%rax) ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 704(%rax) -; AVX512-NEXT: addq $744, %rsp # imm = 0x2E8 +; AVX512-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm16, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 640(%rax) +; AVX512-NEXT: addq $1032, %rsp # imm = 0x408 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride6_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512-FCP-NEXT: vpsrldq 
{{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm8 ; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7] -; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: 
vpshufb %ymm8, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm24, %zmm0 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm8 {%k1} +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm30 = [0,9,2,3,8,5,6,11] +; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm30, %ymm12 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,10,10,10,10] +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm10 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm12 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,1,2,3,11,11,11,11] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm11 +; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [12,1,2,13,4,5,14,7] +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm23, %ymm7 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm17 +; 
AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm8, %zmm29 +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm16 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm7 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm7 +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm5 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %ymm8, %ymm23, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm8 +; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm7 +; AVX512-FCP-NEXT: 
vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm5 +; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [8,8,0,9,0,1,0,1] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512-FCP-NEXT: kmovw %eax, %k2 +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm26, %zmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,1,8,3,4,9,6,7] +; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm25, %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} +; 
AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm25, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,8,8,0,9] +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,0,3,10,0,10,11] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,8,8,0,9] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm26 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm13, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpsrldq 
{{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm0 -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm4 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm23, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,2,3,10,9,10,11] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [8,8,0,9,0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 -; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512-FCP-NEXT: kmovw %eax, %k2 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,1,8,3,4,9,6,7] -; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm30, %zmm4 -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm3 -; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm3 {%k2} -; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm4 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm3 -; 
AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm4 -; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm4 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = 
ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] -; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm3 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm13, %zmm0 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[12],ymm10[12],ymm7[13],ymm10[13],ymm7[14],ymm10[14],ymm7[15],ymm10[15] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm1 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm12 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm1, %ymm21 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm3 -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm25 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm3 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = 
xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm13 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm23 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm23, %zmm31 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm15 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm15 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm0 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm16 -; 
AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm27, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm22 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm25, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm8 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm8 {%k2} -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm8, %ymm27 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm8 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm27, %zmm23 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,0,2,1,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[8],ymm10[8],ymm7[9],ymm10[9],ymm7[10],ymm10[10],ymm7[11],ymm10[11] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = 
ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm4 -; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] -; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm1 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm28 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,0,10,10,0] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm6 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k2} +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm0, %ymm25 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm26, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm17 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm30, %ymm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [8,9,20,11,12,21,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,10,10,10,10] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm3 -; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm7 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm29 -; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm0 = 
ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 -; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm10 -; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm10 {%k1} +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[2],mem[2],ymm4[3],mem[3],ymm4[8],mem[8],ymm4[9],mem[9],ymm4[10],mem[10],ymm4[11],mem[11] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm4 +; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm30, %ymm1 ; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11] -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm12 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm12 = 
ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11] -; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm30, %zmm12 -; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512-FCP-NEXT: vpermi2d %ymm5, %ymm12, %ymm6 -; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm27, %zmm12 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm5)) +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm1 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[2],mem[2],ymm4[3],mem[3],ymm4[8],mem[8],ymm4[9],mem[9],ymm4[10],mem[10],ymm4[11],mem[11] +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm7, %ymm30 +; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm30, %zmm4 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm5 +; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm7 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm4)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 448(%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm1 & (zmm7 ^ zmm29)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm28)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm23)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm0 & (zmm9 ^ zmm22)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm25)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm1 & (zmm6 ^ zmm17)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm1 & (zmm12 ^ zmm24)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm2 & (zmm15 ^ zmm31)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: 
vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm3 ^ (zmm2 & (zmm3 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 512(%rax) -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm20 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ mem)) -; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ mem)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 704(%rax) -; AVX512-FCP-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & ~zmm9) | zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ mem)) +; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm16 = zmm16 ^ (zmm8 & (zmm16 ^ mem)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride6_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $1400, %rsp # imm = 0x578 +; AVX512DQ-NEXT: subq $1816, %rsp # imm = 0x718 ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm4 -; AVX512DQ-NEXT: vmovdqa 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm31 +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm30 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[0,1,2,3],zmm2[0,1,0,1] ; AVX512DQ-NEXT: movw $9362, %ax # imm = 0x2492 ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm19 {%k1} ; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4 ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm2 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] @@ -9220,19 +9185,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX512DQ-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,2,1] +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm5 +; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = 
xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm3[0,1,2,3],zmm2[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm18 {%k1} +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] @@ -9249,10 +9215,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9265,8 +9231,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm3[0,1,2,3],zmm1[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm21 {%k1} +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[0,1,2,3],zmm1[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm1 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] @@ -9281,795 +9248,814 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; 
AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm8 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,1,2,1] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm18 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm25 = zmm1[0,1,2,3],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm25 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,3,3,4,5,6,7] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,1,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm9 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7] ; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm16 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm5 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm3 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm6 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm5 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm5[4],ymm3[5],ymm5[5],ymm3[6],ymm5[6],ymm3[7],ymm5[7],ymm3[12],ymm5[12],ymm3[13],ymm5[13],ymm3[14],ymm5[14],ymm3[15],ymm5[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,2,3,3,5,6,7,7] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm26 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm26[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm1 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm1 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %ymm28 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm24 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm2 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm10 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm15 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm15[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm15[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-NEXT: vmovdqa64 32(%rsi), %ymm29 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm1[2,1,2,3,6,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm25 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm10 +; 
AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm1 = ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,1,2,3,6,5,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm10 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[8],ymm0[8],ymm10[9],ymm0[9],ymm10[10],ymm0[10],ymm10[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm14 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[2],ymm0[2],ymm14[3],ymm0[3],ymm14[8],ymm0[8],ymm14[9],ymm0[9],ymm14[10],ymm0[10],ymm14[11],ymm0[11] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm10 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[2,1,2,3,6,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[2,1,2,3,6,5,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = 
ymm14[2,1,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,1,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm22 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512DQ-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm11 -; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm11[0],ymm13[0],ymm11[1],ymm13[1],ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[8],ymm13[8],ymm11[9],ymm13[9],ymm11[10],ymm13[10],ymm11[11],ymm13[11] -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm22, %zmm11 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm1[4],ymm10[4],ymm1[5],ymm10[5],ymm1[6],ymm10[6],ymm1[7],ymm10[7],ymm1[12],ymm10[12],ymm1[13],ymm10[13],ymm1[14],ymm10[14],ymm1[15],ymm10[15] -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[8],ymm10[8],ymm1[9],ymm10[9],ymm1[10],ymm10[10],ymm1[11],ymm10[11] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm30 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-NEXT: 
vmovdqa (%r9), %ymm9 -; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm16, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] +; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm22 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm11, %xmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm26[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm6 +; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX512DQ-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-NEXT: kmovw %eax, %k2 -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm7, %zmm6, %zmm11 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm11)) -; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm1, %zmm0 {%k2} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm30 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm1, 
%zmm27 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm27 & (zmm30 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm20[2,2,2,2] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6 -; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm3 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm3[2,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm22, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[1,1,1,1] -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[8],ymm6[8],ymm3[9],ymm6[9],ymm3[10],ymm6[10],ymm3[11],ymm6[11] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,2,2,3] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm16 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm29[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm11 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm6, %zmm5, %zmm0 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm7, %zmm0 {%k1} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm8, %zmm29 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm31 & (zmm29 ^ zmm0)) -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm13[4],ymm15[4],ymm13[5],ymm15[5],ymm13[6],ymm15[6],ymm13[7],ymm15[7],ymm13[12],ymm15[12],ymm13[13],ymm15[13],ymm13[14],ymm15[14],ymm13[15],ymm15[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm13[0],ymm15[0],ymm13[1],ymm15[1],ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[8],ymm15[8],ymm13[9],ymm15[9],ymm13[10],ymm15[10],ymm13[11],ymm15[11] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm8 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm3 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm19[2,1,2,3] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[12],ymm7[12],ymm8[13],ymm7[13],ymm8[14],ymm7[14],ymm8[15],ymm7[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[3,3,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm17[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm5[1,2,3,3,5,6,7,7] -; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm22, %zmm1 -; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm5[2,2,2,3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm2[2,1,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm6 +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm2 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3] ; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm14 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm14[2,1,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm22, %zmm12 -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm15 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm14[2,2,2,3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm14[2,1,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm4[1,1,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm10[2,1,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm24[2,1,2,3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm5 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm7 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm10[3,3,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm23[2,2,2,2] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm20 = ymm13[1,2,3,3,5,6,7,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm14[1,1,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm31[2,2,2,3] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm4[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm4 = 
mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm4[2,2,2,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm9[1,1,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero -; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm22 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm5, %xmm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm5[0,1,0,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,2,2,2] -; AVX512DQ-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm11 = mem[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,2,2,3] -; AVX512DQ-NEXT: vpbroadcastq %xmm16, %ymm5 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm16 {%k2} # 32-byte Folded Reload -; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 {%k2} # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2 -; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm21 {%k2} # 32-byte Folded Reload -; AVX512DQ-NEXT: vpbroadcastq %xmm30, %ymm2 -; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm25 {%k2} # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] 
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm2 & (zmm5 ^ zmm16)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm30 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm18)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm18 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm2 & (zmm18 ^ zmm21)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm16 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm2 & (zmm16 ^ zmm25)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm21 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm2 = mem[2,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm25, %zmm21 {%k2} +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm3[2,2,2,2] +; AVX512DQ-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm3 = mem[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm3[0,1,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm13[1,1,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,3] +; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 {%k2} # 32-byte Folded Reload +; AVX512DQ-NEXT: vpbroadcastq %xmm9, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 {%k2} # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero +; AVX512DQ-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 {%k2} # 32-byte Folded Reload +; AVX512DQ-NEXT: vpbroadcastq %xmm22, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm31 {%k2} # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm22 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm18 & (zmm22 ^ zmm19)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm18, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm4 & (zmm19 ^ zmm3)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 
64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm4 & (zmm3 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm4 & (zmm13 ^ zmm31)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm4 = mem[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm1, %zmm18 {%k2} ; AVX512DQ-NEXT: movw $-28087, %ax # imm = 0x9249 ; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm27, %zmm2, %zmm21 {%k3} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm20, %zmm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm17[2,2,2,3] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm19, %zmm6 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm14, %zmm1 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm7, %zmm13, %zmm1 {%k1} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm3, %zmm4, %zmm12 {%k2} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm17 = mem[2,1,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm9[3,3,3,3] -; AVX512DQ-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm19 = mem[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm10[1,2,3,3,5,6,7,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm31 & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm9, %zmm1, %zmm12 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm9 = mem[2,3,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 {%k3} # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm5 = mem[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm5, %zmm1, %zmm4 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm28, %zmm1, %zmm4 {%k3} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, 
%zmm26, %zmm5 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm20, %zmm25, %zmm5 {%k2} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm21, %zmm23, %zmm1 {%k2} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm14, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm15, %zmm16, %zmm8 {%k2} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm10, %zmm11, %zmm8 {%k1} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm7, %zmm6 {%k2} +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm8)) +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm7[2,2,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,1,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm8, %zmm6 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm8 = mem[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm8[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm31 & (zmm9 ^ zmm12)) -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm12 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm13 = ymm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm0[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = 
ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX512DQ-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm14 = mem[2,1,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] +; AVX512DQ-NEXT: vpermq $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm12 = mem[2,1,2,3] ; AVX512DQ-NEXT: vpermq $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm15 = mem[3,3,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[1,2,3,3,5,6,7,7] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm4 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm24, %zmm27, %zmm6 {%k3} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm17, %zmm17 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm25[2,2,2,3] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm20, %zmm19, %zmm17 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm11, %zmm17 {%k3} -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm8 +; AVX512DQ-NEXT: vpermq $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm16 = mem[2,2,2,2] +; AVX512DQ-NEXT: vpshufd $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm17 = mem[1,2,3,3,5,6,7,7] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm9, %ymm7 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm10, %zmm12, %zmm8 {%k2} -; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm8 {%k3} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm4 & (zmm0 ^ zmm21)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm28, %zmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm6)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm13, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm17)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm4 & (zmm3 ^ zmm8)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm27 & (zmm8 ^ zmm6)) +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[2,3,2,3,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm21 # 64-byte Folded Reload +; AVX512DQ-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm1 = mem[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm20, %zmm23, %zmm5 {%k3} +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm17[2,2,2,3] +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm15, %zmm16, %zmm12 {%k2} +; AVX512DQ-NEXT: vinserti32x8 $1, %ymm7, %zmm10, %zmm12 {%k3} +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm18)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm29, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm15 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm10 & (zmm15 ^ zmm4)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm11, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm5)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm10 & (zmm4 ^ zmm12)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & ~zmm27) | zmm21 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm29, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rax) -; 
AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 704(%rax) -; AVX512DQ-NEXT: addq $1400, %rsp # imm = 0x578 +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm19, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm22, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 704(%rax) +; AVX512DQ-NEXT: addq $1816, %rsp # imm = 0x718 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0 +; AVX512DQ-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm8 ; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm24, %zmm0 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm8 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm30 = [0,9,2,3,8,5,6,11] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm12 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm30, %ymm12 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm9 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm8 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,10,10,10,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm14, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm10 = 
ymm10[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm10[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,1,2,3,11,11,11,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm14, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [12,1,2,13,4,5,14,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm7 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm23, %ymm7 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [8,21,10,11,20,13,14,23] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm17 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm28, %zmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [2,2,0,3,10,0,10,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm8, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm5, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm5 = ymm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm8[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm7 = 
ymm9[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm9[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm14, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm8, %ymm23, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm8 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm28, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,10,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [8,8,0,9,0,1,0,1] 
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 +; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [16,9,10,17,12,13,18,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm26, %zmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,1,8,3,4,9,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm25, %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm25, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,0,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] -; AVX512DQ-FCP-NEXT: 
vpshufb %ymm9, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,0,3,10,0,10,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,0,8,8,0,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm26 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm13, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = 
ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm23, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,2,2,3,10,9,10,11] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm30, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [8,8,0,9,0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 -; 
AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492 -; AVX512DQ-FCP-NEXT: kmovw %eax, %k2 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,1,8,3,4,9,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm4, %zmm3 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero -; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm27, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [0,0,0,0,8,8,0,9] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: 
vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[8],ymm4[8],ymm5[9],ymm4[9],ymm5[10],ymm4[10],ymm5[11],ymm4[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[12],ymm6[12],ymm7[13],ymm6[13],ymm7[14],ymm6[14],ymm7[15],ymm6[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm21, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm24, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm22, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, 
%ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm13, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm7[4],ymm10[4],ymm7[5],ymm10[5],ymm7[6],ymm10[6],ymm7[7],ymm10[7],ymm7[12],ymm10[12],ymm7[13],ymm10[13],ymm7[14],ymm10[14],ymm7[15],ymm10[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm23, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm12 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm1, %ymm21 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm13 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm23 +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm0 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm23, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm15 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm27, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm20 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm25, 
%ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm29, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm8 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm8, %ymm27 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm3 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm31, %zmm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm27, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,0,2,1,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[2],ymm10[2],ymm7[3],ymm10[3],ymm7[8],ymm10[8],ymm7[9],ymm10[9],ymm7[10],ymm10[10],ymm7[11],ymm10[11] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm1 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm30 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm1 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; 
AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm28 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,0,0,1,0,10,10,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm6 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm11, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k2} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero +; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm0, %ymm25 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,1,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm26, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm6 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm27, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm25, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm30, %ymm2 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [8,9,20,11,12,21,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 
= [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,1,0,1,10,10,10,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm7 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm7 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm29 -; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # 
xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[2],mem[2],ymm4[3],mem[3],ymm4[8],mem[8],ymm4[9],mem[9],ymm4[10],mem[10],ymm4[11],mem[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3 -; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm30, %ymm1 ; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm10 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm30, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5 -; AVX512DQ-FCP-NEXT: vpermi2d %ymm5, %ymm12, %ymm6 -; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm5, %zmm27, %zmm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4 -; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm13, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm14, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[2],mem[2],ymm4[3],mem[3],ymm4[8],mem[8],ymm4[9],mem[9],ymm4[10],mem[10],ymm4[11],mem[11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[2],mem[2],ymm7[3],mem[3],ymm7[8],mem[8],ymm7[9],mem[9],ymm7[10],mem[10],ymm7[11],mem[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm7, %ymm30 +; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm30, %zmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm5 +; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm14, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq 
{{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm4)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 448(%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm1 & (zmm7 ^ zmm29)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 640(%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm28)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm23)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm0 & (zmm9 ^ zmm22)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm25)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 448(%rax) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm1 & (zmm6 ^ zmm17)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm1 & (zmm12 ^ zmm24)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm2 & (zmm15 ^ zmm31)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm2 & (zmm3 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 
= zmm3 ^ (zmm1 & (zmm3 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 512(%rax) +; AVX512DQ-FCP-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm1 # 64-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 512(%rax) -; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm20 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ mem)) -; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ mem)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 704(%rax) -; AVX512DQ-FCP-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & ~zmm9) | zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 640(%rax) +; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ mem)) +; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm16 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm16 = zmm16 ^ (zmm8 & (zmm16 ^ mem)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax) +; AVX512DQ-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index fafb69be0d380..32f5f7a713a1f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -645,8 +645,10 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 & (zmm2 | zmm1) +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm0 & ~zmm1) | zmm2 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512-NEXT: vmovq %xmm0, 48(%rax) @@ -727,8 +729,10 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 & (zmm2 | zmm1) +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm0 & ~zmm1) | zmm2 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 ; AVX512DQ-NEXT: vmovq %xmm0, 48(%rax) @@ -1369,27 +1373,32 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,2,0] ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512-NEXT: vporq %zmm7, %zmm9, %zmm7 -; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm10 & (zmm9 | zmm7) +; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm11[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm11[u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,4,5,12,13],zero,zero,ymm11[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm11[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & mem) | zmm11 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm12 & ~zmm10) | zmm9 +; AVX512-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandn %ymm9, %ymm11, %ymm9 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] -; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7 -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2)) +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm8[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm8[u,u,u,u] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm2) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm9) ; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1402,7 +1411,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1429,27 +1438,32 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vporq %zmm7, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm9 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm10 & (zmm9 | zmm7) +; AVX512-FCP-NEXT: 
vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm11[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm11[u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,4,5,12,13],zero,zero,ymm11[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm11[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm13 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & mem) | zmm11 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm12 & ~zmm10) | zmm9 +; AVX512-FCP-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandn %ymm9, %ymm11, %ymm9 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u],zero,zero,zero,zero,ymm6[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[20,21,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm8[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm8[u,u,u,u] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm6) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm9) ; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1462,7 +1476,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1492,27 +1506,32 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,2,0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[0,1,8,9],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,28,29,20,21] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512DQ-NEXT: vporq %zmm7, %zmm9, %zmm7 -; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm10 & (zmm9 | zmm7) +; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm11[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm11[u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,4,5,12,13],zero,zero,ymm11[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm11[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512DQ-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & mem) | zmm11 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm12 & ~zmm10) | zmm9 +; AVX512DQ-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandn %ymm9, %ymm11, %ymm9 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,1,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u],zero,zero,zero,zero,ymm2[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[20,21,28,29,u,u,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm2, %ymm7, %ymm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (mem & (ymm8 ^ ymm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm8[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm8[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm8[u,u,u,u] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm2) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm9) ; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm2 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1525,7 +1544,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1552,27 +1571,32 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 -; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm11 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm10 & (zmm9 | zmm7) +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm11[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm11[u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,4,5,12,13],zero,zero,ymm11[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm11[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 +; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm12 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & mem) | zmm11 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm10, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm12 & ~zmm10) | zmm9 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%r10), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm11, 
%ymm9 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u],zero,zero,zero,zero,ymm6[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[20,21,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm9[1,3,3,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,ymm7[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm7[u,u,u,u,u,u,u,u,28,29,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm8 = (m32bcst & ~ymm8) | ymm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (mem & (ymm8 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm8[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm8[u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm6) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm9) ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -1585,7 +1609,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2819,524 +2843,588 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-LABEL: store_i16_stride7_vf16: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-NEXT: vmovdqa (%r9), %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512-NEXT: vmovdqa (%r8), %ymm3 +; AVX512-NEXT: vmovdqa (%r9), %ymm4 ; AVX512-NEXT: vmovdqa (%rax), %ymm13 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vporq %ymm1, %ymm4, %ymm16 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u],zero,zero,ymm10[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[16,17,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm9[u,u,u,u,u,u,14,15],zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm9[u,u,u,u,u,u,u,u] +; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm17 ; AVX512-NEXT: vmovdqa (%rcx), %xmm14 ; AVX512-NEXT: vmovdqa (%rdx), %xmm15 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-NEXT: vporq %ymm1, %ymm4, %ymm17 -; AVX512-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u] -; AVX512-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm18 +; AVX512-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u] +; AVX512-NEXT: vporq %ymm1, %ymm8, %ymm19 +; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm21 +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7,8,9],ymm12[10],ymm8[11,12],ymm12[13],ymm8[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20 -; AVX512-NEXT: vprold $16, %xmm4, %xmm11 -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512-NEXT: vprold $16, %xmm7, %xmm11 +; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[1,1,2,3] ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,1,1,0,0,0,0,16,17,0,0,18,0] -; AVX512-NEXT: vpermi2d %zmm11, %zmm12, %zmm18 -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,1,1,1,0,0,16,0,16,17,0,0,18,19] +; AVX512-NEXT: vpermi2d %zmm11, %zmm12, %zmm16 +; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] ; AVX512-NEXT: vmovdqa (%r9), %xmm11 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512-NEXT: vmovdqa (%r8), %xmm15 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = 
ymm2[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] +; AVX512-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 -; AVX512-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] +; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512-NEXT: vprold $16, %ymm4, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; 
AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6,7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13,14,15] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm9 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm23[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,2,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd %zmm13, %zmm3, %zmm3 -; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,4,0,0,5,5,0,4,5,0,0,6,6,0,6] +; AVX512-NEXT: vpermd %zmm13, %zmm4, %zmm4 +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,3,3,3,6,7,7,7] ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (mem & (zmm9 ^ zmm7)) -; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9)) -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 -; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (mem & (zmm7 ^ zmm4)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm7)) -; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512-NEXT: vpternlogq 
{{.*#+}} zmm0 = zmm18 ^ (mem & (zmm0 ^ zmm18)) -; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm8)) -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm6)) -; AVX512-NEXT: vmovdqa %ymm0, 192(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm8 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm13, %zmm8, %zmm8 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm9 & ~zmm15) | zmm8 +; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm8 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm15)) +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm8 +; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm20[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm10, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm8 & ~zmm10) | zmm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm8, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm4 & ~zmm8) | zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm10)) +; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm2[0,0,1,1,4,4,5,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm16 & ~zmm2) | zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm0, %zmm7, %zmm4 +; AVX512-NEXT: vpbroadcastd (%rax), %ymm7 +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm7 & ~zmm0) | zmm4 +; AVX512-NEXT: 
vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm11 & (ymm5 ^ ymm1)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,1,3,2] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) +; AVX512-NEXT: vmovdqa %ymm1, 192(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rcx) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf16: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm16 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm0, %ymm5, %ymm18 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm4[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-FCP-NEXT: vporq %ymm0, %ymm5, %ymm19 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm14 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = 
ymm4[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512-FCP-NEXT: vporq %ymm0, %ymm8, %ymm20 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6,7,8],ymm0[9],ymm9[10,11],ymm0[12],ymm9[13,14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,1,3,2,8,10,10,11] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 ; AVX512-FCP-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7,8,9],ymm0[10],ymm13[11,12],ymm0[13],ymm13[14,15] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512-FCP-NEXT: vmovdqa (%rdx), 
%xmm8 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7,8,9,10],ymm0[11],ymm13[12,13],ymm0[14],ymm13[15] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,0,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 -; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9] -; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm5, %zmm13 +; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,1,0,8,8,0,9] +; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm15 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm5[0,0,2,1] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm5[0,0,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1],xmm8[2,3],xmm5[4],xmm8[5,6],xmm5[7] +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = 
xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm12 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm12 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,8,9,9,0] +; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[16,17,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512-FCP-NEXT: # 
zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm9 ^ (mem & (zmm0 ^ zmm9)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm0)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm15 ^ (mem & (zmm0 ^ zmm15)) -; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm20)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (mem & (ymm1 ^ ymm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm16 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (zmm2 & ~zmm16) | zmm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm5, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm16)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm10[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm5, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm9 & ~zmm5) | zmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm2, %zmm13, %zmm9 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm4 & ~zmm2) | zmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm5)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm11[0,0,1,1,4,4,5,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm15 & ~zmm5) | zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm4, %zmm14, %zmm8 +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm9 & ~zmm4) | zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm6 ^ (ymm7 & (ymm3 ^ ymm6)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rcx) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm4 ; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vporq %ymm1, %ymm4, %ymm16 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u],zero,zero,ymm10[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb 
{{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,14,15],zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm9[u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm17 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm14 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-NEXT: vporq %ymm1, %ymm4, %ymm17 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u] -; AVX512DQ-NEXT: vporq %ymm5, %ymm10, %ymm19 -; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm18 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17],zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u] +; AVX512DQ-NEXT: vporq %ymm1, %ymm8, %ymm19 +; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm1 +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm21 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7,8,9],ymm12[10],ymm8[11,12],ymm12[13],ymm8[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20 -; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm11 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3] +; AVX512DQ-NEXT: vprold $16, %xmm7, %xmm11 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[1,1,2,3] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,0,0,1,1,0,0,0,0,16,17,0,0,18,0] -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm12, %zmm18 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,1,1,1,0,0,16,0,16,17,0,0,18,19] +; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm12, %zmm16 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15] ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm0, 
%xmm1, %xmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7] +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0,1,2],ymm1[3],ymm7[4,5],ymm1[6],ymm7[7,8,9,10],ymm1[11],ymm7[12,13],ymm1[14],ymm7[15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm0, %zmm15 -; AVX512DQ-NEXT: vprold $16, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 +; AVX512DQ-NEXT: vprold $16, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm7 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = 
ymm7[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6,7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm9 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm23[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,2,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd %zmm13, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,4,0,0,5,5,0,4,5,0,0,6,6,0,6] +; AVX512DQ-NEXT: vpermd %zmm13, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,3,3,3,6,7,7,7] ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (mem & (zmm9 ^ zmm7)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (mem & 
(zmm7 ^ zmm4)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm7)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm18 ^ (mem & (zmm0 ^ zmm18)) -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm8)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm6)) -; AVX512DQ-NEXT: vmovdqa %ymm0, 192(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rcx) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm13, %zmm8, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm13, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm9 & ~zmm15) | zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm10, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 | (zmm8 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm15)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm20[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm10, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm8 & ~zmm10) | zmm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm8, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm4 & ~zmm8) | zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm10)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm2[0,0,1,1,4,4,5,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm16 & ~zmm2) | zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm0, %zmm7, %zmm4 +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm7 +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm7 & ~zmm0) | zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm11 & (ymm5 ^ ymm1)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,1,3,2] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm5)) +; AVX512DQ-NEXT: vmovdqa %ymm1, 192(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rcx) +; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rcx) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf16: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm7 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm5, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm4[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[12,13,14,15],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm3[u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm5, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm14 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: 
vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u] +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm8, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3],ymm0[4],ymm9[5,6,7,8],ymm0[9],ymm9[10,11],ymm0[12],ymm9[13,14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,1,3,2,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm9 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm1[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7,8,9],ymm0[10],ymm13[11,12],ymm0[13],ymm13[14,15] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,0,2,1,4,4,6,5] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7,8,9,10],ymm0[11],ymm13[12,13],ymm0[14],ymm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,0,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm5, %zmm13 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,1,1,0,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm15 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm5[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm5[0,0,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm8[0],xmm5[1],xmm8[2,3],xmm5[4],xmm8[5,6],xmm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20 -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] -; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm12 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm0, %zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[16,17,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
%ymm8, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,3] +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0] +; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm9 ^ (mem & (zmm0 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm15 ^ (mem & (zmm0 ^ zmm15)) -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm20)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (mem & (ymm1 ^ ymm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
%ymm19, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm8, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm16 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = (zmm2 & ~zmm16) | zmm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm5, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm10[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm5, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm9 & ~zmm5) | zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm13, %zmm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm4 & ~zmm2) | zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm11[0,0,1,1,4,4,5,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm15 & ~zmm5) | zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm4, %zmm14, %zmm8 +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm9 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm10 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm9 & ~zmm4) | zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm6 ^ (ymm7 & (ymm3 ^ ymm6)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm0 ^ (mem & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm3)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 192(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rcx) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rcx) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -5859,1247 +5947,1309 @@ define void 
@store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride7_vf32: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $664, %rsp # imm = 0x298 -; AVX512-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm13, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] +; AVX512-NEXT: subq $808, %rsp # imm = 0x328 +; AVX512-NEXT: vmovdqa (%r9), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa (%r8), %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] +; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm18 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = 
[12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-NEXT: vpshufb %ymm2, %ymm4, %ymm3 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3] -; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512-NEXT: vpermi2d %zmm3, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm3 -; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm4 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm3 +; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm4 ; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm4 +; AVX512-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX512-NEXT: vpshufb %ymm6, %ymm12, %ymm3 +; AVX512-NEXT: vpshufb %ymm14, %ymm13, %ymm4 ; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = 
[0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] -; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm24 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] +; AVX512-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm8 +; AVX512-NEXT: vmovdqa (%r8), %xmm9 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] +; AVX512-NEXT: vpermi2d %zmm10, %zmm3, %zmm26 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa (%rax), %ymm6 -; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6] -; 
AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vpandn %ymm3, %ymm10, %ymm10 +; AVX512-NEXT: vmovdqa (%rax), %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm14 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,0,1,1] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,0,2,1] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm6[0,2,2,3] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = 
ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm6[0,2,2,3] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm6[2,1,3,3] +; AVX512-NEXT: vprold $16, %ymm12, %ymm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,1,3,2] +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm6[2,2,2,3] +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-NEXT: vprold $16, %xmm9, %xmm10 +; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX512-NEXT: 
vmovdqa 32(%rdx), %xmm9 +; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm4[1],xmm15[2,3],xmm4[4],xmm15[5,6],xmm4[7] ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa %ymm9, %ymm8 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512-NEXT: vprold $16, %xmm5, %xmm6 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm17 = xmm4[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm6[2,1,3,3] +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm4[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm7[2,2,2,3] +; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512-NEXT: # xmm4 = mem[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,1,3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,3,3,3,6,7,7,7] ; AVX512-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; 
AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm25 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-NEXT: vprold $16, %xmm3, %xmm3 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512-NEXT: vmovdqa %ymm10, %ymm14 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = 
ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8,9,10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,0,1,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = mem[2,1,3,2] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,2,2,3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm18[2,1,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,2,3] -; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; AVX512-NEXT: vprold $16, %ymm14, %ymm5 -; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] -; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm15 -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm28 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm29 & (zmm28 ^ zmm15)) -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm29 & (zmm0 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512-NEXT: vprold $16, %ymm14, 
%ymm2 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm3 -; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512-NEXT: # ymm10 = mem[0,0,2,1] -; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] -; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512-NEXT: # ymm18 = mem[2,2,2,3] -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512-NEXT: # ymm20 = mem[2,1,3,2] -; AVX512-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512-NEXT: # ymm26 = mem[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm23[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm2 -; AVX512-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm28)) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm9 & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm2, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte 
Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm26[0,1,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm2 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm4)) -; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm2 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) -; AVX512-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm24)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd %zmm6, %zmm5, %zmm5 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm27, %zmm3, %zmm3 +; AVX512-NEXT: vpandnq %zmm21, %zmm27, %zmm21 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm28 & (zmm21 | zmm3) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm3 +; AVX512-NEXT: vpandnq %zmm3, %zmm28, %zmm22 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm3 & (zmm22 | zmm21) +; AVX512-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] +; AVX512-NEXT: vpermd %zmm14, %zmm21, %zmm21 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm21 & ~zmm3) | zmm22 +; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm17[0,1,1,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm11, %zmm16 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm16 ^ (zmm27 & (zmm15 ^ zmm16)) +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm10 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] +; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm16 +; AVX512-NEXT: vinserti64x4 $1, %ymm16, 
%zmm1, %zmm1 +; AVX512-NEXT: vpandq %zmm28, %zmm10, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & ~zmm17) | zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm10 # 64-byte Folded Reload +; AVX512-NEXT: # zmm10 = (zmm10 & zmm19) | mem +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm11, %zmm17 # 32-byte Folded Reload +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512-NEXT: # ymm18 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] +; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512-NEXT: # ymm21 = mem[0,0,1,1] +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm30[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm29[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm17 ^ (zmm27 & (zmm9 ^ zmm17)) +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm18, %zmm11 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm21, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (zmm27 & (zmm12 ^ zmm11)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandq {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 64-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm11, %zmm11 +; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm18 +; AVX512-NEXT: vpbroadcastd 40(%rax), 
%ymm21 +; AVX512-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = (zmm18 & ~zmm11) | zmm17 +; AVX512-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm17 +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm16 & (zmm11 ^ zmm26)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm10 ^ (mem & (zmm9 ^ zmm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm12)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm10, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm10)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm24[0,1,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm10 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm12)) +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm25, %zmm10 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm12 ^ (mem & (zmm10 ^ zmm12)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-NEXT: vpermd %zmm14, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512-NEXT: addq $664, %rsp # imm = 0x298 +; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512-NEXT: addq $808, %rsp # imm = 0x328 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf32: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX512-FCP-NEXT: subq $360, %rsp # imm = 0x168 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: 
vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm4 ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm7 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm5 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm8 +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm3 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,10,0,11,0] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm25 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, 
%xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,0,8,8,9] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,2,2,3,8,0,9,0] +; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm23 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm10 +; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm9, %zmm22 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm7 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm26 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm26 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [8,9,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3],xmm3[4],xmm9[5,6],xmm3[7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,0,9,0,0,0,1,1] ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm29 -; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] -; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,0] +; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm28 +; AVX512-FCP-NEXT: vprold $16, %ymm13, %ymm6 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = 
[18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,0,11,10] +; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11] +; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpandnq %ymm6, %ymm31, %ymm6 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm12 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, 
%zmm30 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,3,10,10,11,11] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm1 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7,8,9],ymm7[10],ymm0[11,12],ymm7[13],ymm0[14,15] -; AVX512-FCP-NEXT: vprold $16, %ymm13, %ymm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] +; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7,8,9],ymm11[10],ymm4[11,12],ymm11[13],ymm4[14,15] +; AVX512-FCP-NEXT: vprold $16, %ymm8, %ymm11 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2,3],xmm8[4],xmm15[5,6],xmm8[7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,0,9] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm11 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,2,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm5, 
%zmm16, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] -; AVX512-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 +; AVX512-FCP-NEXT: vprold $16, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm11 +; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 +; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm7, %zmm29, %zmm13 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & ~zmm7) | zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7,8,9],ymm13[10],ymm7[11,12],ymm13[13],ymm7[14,15] +; 
AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm16[0,0,1,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm9[2,2,2,3] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-FCP-NEXT: vpermd %ymm5, %ymm21, %ymm5 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,0,0,0,7,0,0,7] +; AVX512-FCP-NEXT: vpermd %ymm12, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm26 ^ (zmm12 & (zmm7 ^ zmm26)) -; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm28)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm11)) -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[2,2,2,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm12 & (zmm3 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm17 & (zmm21 ^ 
zmm25)) -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm26 ^ (zmm12 & (zmm8 ^ zmm26)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm26, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX512-FCP-NEXT: vpandnq %zmm11, %zmm8, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm10 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm12 & (zmm10 ^ zmm7)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & zmm31) | zmm30 +; AVX512-FCP-NEXT: vpandnq %zmm25, %zmm26, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 & (zmm3 | zmm7) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm7 & (zmm11 | zmm3) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm0 & ~zmm7) | zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm20 ^ (zmm26 & (zmm19 ^ zmm20)) +; AVX512-FCP-NEXT: vpandq %zmm3, %zmm22, %zmm0 +; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & ~zmm3) | zmm0 +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm28)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10)) ; 
AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm23)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm21)) -; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm27)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm6 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm19)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm27)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512-FCP-NEXT: addq $296, %rsp # imm = 0x128 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512-FCP-NEXT: addq $360, %rsp # imm = 0x168 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $664, %rsp # imm = 0x298 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: subq $808, %rsp # imm = 0x328 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; 
AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm17 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm18 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm3 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20 -; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; AVX512DQ-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3] -; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm4 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm4 ; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm15 -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm15, %ymm4 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm12 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm13 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm12, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm14, %ymm13, %ymm4 ; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm14 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] -; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] -; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm24 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm4 = 
[6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3],xmm3[4],xmm8[5,6],xmm3[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3] +; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm3, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0] +; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm3, %zmm26 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm0 -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm6 -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm28 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm29 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm30 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm31 -; AVX512DQ-NEXT: vprold $16, %ymm13, %ymm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] +; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vpandn %ymm3, %ymm10, %ymm10 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm14 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm25 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm8, %xmm8 +; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm23 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,0,1,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm6[0,0,2,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm6[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm6[0,2,2,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm6[0,2,2,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm0 +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm6[2,1,3,3] +; AVX512DQ-NEXT: vprold $16, %ymm12, %ymm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,1,3,2] +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm6[2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm11 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-NEXT: vprold $16, %xmm9, %xmm10 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm8[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7] +; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm4[1],xmm15[2,3],xmm4[4],xmm15[5,6],xmm4[7] ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,3,3,3,7,7,7,7] -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm17 = xmm4[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm6[2,1,3,3] +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm10, %xmm4 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm4[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm7[2,2,2,3] +; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm4 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,1,3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm1[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7,8,9],ymm4[10],ymm0[11,12],ymm4[13],ymm0[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm30 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm29 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm13[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm25 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm10 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: 
vpshufd {{.*#+}} ymm5 = ymm20[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5 -; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7] -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; 
AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm5 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm26[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm26[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm14 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 
= ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8,9,10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm0[0,0,1,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3] -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = mem[2,1,3,2] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm6 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm25[0,2,2,3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm18[2,1,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,2,3] -; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm5 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,1,3] -; AVX512DQ-NEXT: vprold $16, %ymm14, %ymm5 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7,8,9],ymm5[10],ymm15[11,12],ymm5[13],ymm15[14,15] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm15 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm28 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm29 & (zmm28 ^ zmm15)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm29 & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] +; AVX512DQ-NEXT: vprold $16, %ymm14, %ymm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw 
{{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm3 -; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm10 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] -; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm18 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm20 = mem[2,1,3,2] -; AVX512DQ-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm26 = mem[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm23[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,0,2,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm21[2,1,3,2] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm16[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm28)) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm17, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm9 & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm0 -; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm2, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm26[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 | (zmm2 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (mem & (zmm19 ^ zmm4)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (mem & (zmm2 ^ zmm4)) -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm24)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm5, %zmm5 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm27, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpandnq %zmm21, %zmm27, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm28 & (zmm21 | zmm3) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpandnq %zmm3, %zmm28, %zmm22 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm3 & (zmm22 | zmm21) +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm14 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] +; AVX512DQ-NEXT: vpermd %zmm14, %zmm21, %zmm21 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm21 & ~zmm3) | zmm22 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm17[0,1,1,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm11, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm16 ^ (zmm27 & (zmm15 ^ zmm16)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm10 +; AVX512DQ-NEXT: vpermq {{.*#+}} 
ymm1 = ymm1[2,1,3,2] +; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm16 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpandq %zmm28, %zmm10, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & ~zmm17) | zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm10 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm10 = (zmm10 & zmm19) | mem +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, (%rsp), %zmm11, %zmm17 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm18 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm11 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,3] +; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm21 = mem[0,0,1,1] +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm30[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm29[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm24[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm17 ^ (zmm27 & (zmm9 ^ zmm17)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm18, %zmm11 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm21, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (zmm27 & (zmm12 ^ zmm11)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm17 # 64-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm18 +; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm21 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm21, %zmm18, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = (zmm18 & ~zmm11) | zmm17 +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm11 +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm17 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm16 & (zmm11 ^ zmm26)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm10 ^ (mem & (zmm9 ^ zmm10)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm12)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm10, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm23, %zmm12, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm10)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm24[0,1,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm10 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm12)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm25, %zmm10 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm12 ^ (mem & (zmm10 ^ zmm12)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm10)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm7)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-NEXT: vpermd %zmm14, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rax) -; AVX512DQ-NEXT: addq $664, %rsp # imm = 0x298 +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512DQ-NEXT: addq $808, %rsp # imm = 0x328 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $296, %rsp # imm = 0x128 +; AVX512DQ-FCP-NEXT: subq $360, %rsp # imm = 0x168 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm7 -; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm3 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm22 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,10,0,11,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm25 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, 
%ymm8, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,0,8,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,2,2,3,8,0,9,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm20 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm19 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm5 -; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm23 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm10 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm9, 
%zmm22 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3,4],xmm7[5],xmm3[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm7 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm26 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [8,9,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28 +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm26 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3],xmm3[4],xmm9[5,6],xmm3[7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,0,9,0,0,0,1,1] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm29 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3],ymm4[4,5],ymm8[6],ymm4[7,8,9,10],ymm8[11],ymm4[12,13],ymm8[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm31 +; 
AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm28 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm13, %ymm6 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm5[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,3,3,10,0,11,10] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7,8,9],ymm1[10],ymm9[11,12],ymm1[13],ymm9[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpandnq %ymm6, %ymm31, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm30 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7,8,9],ymm6[10],ymm3[11,12],ymm6[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,3,10,10,11,11] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5,6,7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm16[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7,8,9],ymm7[10],ymm0[11,12],ymm7[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm13, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} 
ymm14 = ymm16[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm7[2],ymm14[3,4],ymm7[5],ymm14[6,7,8,9],ymm7[10],ymm14[11,12],ymm7[13],ymm14[14,15] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm11[2],ymm3[3,4],ymm11[5],ymm3[6,7,8,9],ymm11[10],ymm3[11,12],ymm11[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm21, %zmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm11[2],ymm4[3,4],ymm11[5],ymm4[6,7,8,9],ymm11[10],ymm4[11,12],ymm11[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm8, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7,8,9],ymm11[10],ymm8[11,12],ymm11[13],ymm8[14,15] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2,3],xmm7[4],xmm15[5,6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm15[0],xmm8[1],xmm15[2,3],xmm8[4],xmm15[5,6],xmm8[7] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,0,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm7 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm11 -; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm0, %zmm15, %zmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm17[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7,8,9],ymm3[10],ymm6[11,12],ymm3[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm17 & (zmm12 ^ zmm24)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,0,0,0,7,0,0,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm8 +; AVX512DQ-FCP-NEXT: vprold $16, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm15, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6] +; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm7, %zmm29, %zmm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & ~zmm7) | zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7,8,9],ymm13[10],ymm7[11,12],ymm13[13],ymm7[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm13 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm16[0,0,1,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm9[2,2,2,3] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm21, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm21, %ymm5 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,0,0,0,7,0,0,7] +; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm26 ^ (zmm12 & (zmm7 ^ zmm26)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm13 -; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm13 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm28)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm24[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7,8],ymm7[9],ymm11[10,11],ymm7[12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm19[0,0,1,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} 
ymm15 = ymm18[2,2,2,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm16, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm12 & (zmm3 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm17 & (zmm21 ^ zmm25)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm26 ^ (zmm12 & (zmm8 ^ zmm26)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm26, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535] +; AVX512DQ-FCP-NEXT: vpandnq %zmm11, %zmm8, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm0 +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm10 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm12 & (zmm10 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm2 & zmm31) | zmm30 +; AVX512DQ-FCP-NEXT: vpandnq %zmm25, %zmm26, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 & (zmm3 | zmm7) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm7 & (zmm11 | zmm3) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm0 & ~zmm7) | zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm20 ^ (zmm26 & (zmm19 ^ zmm20)) +; AVX512DQ-FCP-NEXT: vpandq %zmm3, %zmm22, %zmm0 +; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & ~zmm3) | zmm0 +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm28)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10)) ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm23)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm21)) -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm27)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm9, %zmm9 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm6 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm19)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm27)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $296, %rsp # imm = 0x128 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-FCP-NEXT: 
vmovdqa64 %zmm8, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax) +; AVX512DQ-FCP-NEXT: addq $360, %rsp # imm = 0x168 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -12524,326 +12674,327 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX512-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX512-NEXT: vmovdqa 96(%rdx), %ymm15 +; AVX512-NEXT: subq $2808, %rsp # imm = 0xAF8 +; AVX512-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX512-NEXT: vmovdqa 96(%rdx), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm0, %ymm6, %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm3 -; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm18 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-NEXT: vpshufb %ymm12, %ymm7, %ymm4 -; AVX512-NEXT: vporq %ymm2, %ymm4, %ymm19 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm1, %ymm12, %ymm4 +; AVX512-NEXT: vporq %ymm3, %ymm4, %ymm18 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm5 +; AVX512-NEXT: vporq %ymm3, %ymm5, %ymm19 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqa64 
%ymm4, %ymm29 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm4 -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX512-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm10 +; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512-NEXT: vpshufb %ymm10, %ymm14, %ymm5 +; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; AVX512-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa (%r8), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm2 +; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX512-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512-NEXT: vpshufb %ymm15, %ymm4, %ymm10 ; AVX512-NEXT: vpor %ymm10, %ymm9, %ymm9 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX512-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm9 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = 
ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,3] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 96(%r9), %ymm9 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm10[3,3,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm10[3,3,3,3] ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,2,2,2] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] +; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm10[2,2,2,2] +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX512-NEXT: vprold $16, %ymm9, %ymm8 -; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 +; AVX512-NEXT: vpshufb %ymm15, %ymm9, %ymm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm18, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} 
zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6)) -; AVX512-NEXT: vmovdqa 96(%r8), %ymm6 -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (mem & (zmm7 ^ zmm0)) +; AVX512-NEXT: vmovdqa 96(%r8), %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,u,u,u,u],zero,zero ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm12) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11) -; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm12) +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm0 & ymm1) +; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6)) -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7)) -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm1 & (ymm7 ^ ymm0)) +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,2,2,2] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm7)) +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[0,1,2,3] +; AVX512-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX512-NEXT: vpandn %ymm7, %ymm12, %ymm7 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm17 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512-NEXT: vpandnq %ymm6, %ymm30, %ymm6 -; AVX512-NEXT: vmovdqa 64(%rax), %ymm7 -; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512-NEXT: vmovdqa 96(%rax), %ymm7 +; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX512-NEXT: vpandn %ymm8, %ymm15, %ymm8 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-NEXT: vpbroadcastd 72(%rax), %ymm7 +; AVX512-NEXT: vpandnq %ymm7, %ymm25, %ymm7 +; AVX512-NEXT: vmovdqa 64(%rax), %ymm8 +; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX512-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm18 -; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm16 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,0,1,1] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%rcx), %xmm8 ; AVX512-NEXT: vmovdqa 64(%rdx), %xmm7 -; 
AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX512-NEXT: vmovdqa 64(%rsi), %xmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,0,2,1] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm12 -; AVX512-NEXT: vpandnq %ymm12, %ymm30, %ymm12 -; AVX512-NEXT: vmovdqa (%rax), %ymm13 -; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm10, %ymm13, %ymm13 -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm6 -; AVX512-NEXT: vmovdqa (%r8), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX512-NEXT: vmovdqa64 %xmm13, %xmm25 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512-NEXT: vmovdqa64 %xmm14, %xmm31 -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512-NEXT: vpandnq %ymm12, %ymm25, %ymm12 +; AVX512-NEXT: vmovdqa (%rax), %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm15 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm15 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm29 +; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,1] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm19 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq 
{{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; AVX512-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,2,1] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm15, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,2] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,2,2,3] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512-NEXT: vpshufb %ymm2, %ymm12, %ymm0 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512-NEXT: vpandnq %ymm12, %ymm17, %ymm12 -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] -; 
AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8,9,10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512-NEXT: vpshufb %ymm14, %ymm4, %ymm10 -; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpandnq %ymm12, %ymm18, %ymm12 +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8,9,10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vprold $16, %ymm0, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vprold $16, %ymm4, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm6 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm23[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512-NEXT: vmovdqa64 %ymm27, %ymm10 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm27, %ymm6 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vprold $16, %ymm5, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] @@ -12855,8 +13006,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = 
ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] @@ -12864,11 +13015,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] @@ -12879,8 +13030,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12891,896 +13042,939 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm1 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 -; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm2 -; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) +; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpandq %zmm15, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512-NEXT: vmovdqa %xmm5, %xmm12 -; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX512-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] +; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512-NEXT: vprold $16, %xmm9, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm1 ^ (zmm0 & (zmm18 ^ zmm1)) +; AVX512-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] -; AVX512-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX512-NEXT: vprold $16, %xmm8, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm2 ^ (zmm1 & (zmm14 ^ zmm2)) -; AVX512-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = 
[0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] -; AVX512-NEXT: vpermt2d %zmm4, %zmm5, %zmm2 -; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm4 -; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm23 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm18 & (zmm23 ^ zmm2)) -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512-NEXT: vpshufb %xmm12, %xmm15, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3],xmm4[4],xmm6[5,6],xmm4[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] +; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm3 +; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5 +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm16 & (zmm3 ^ zmm1)) +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %xmm17, %xmm5 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX512-NEXT: vprold $16, %xmm19, %xmm3 -; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm1 & (zmm7 ^ zmm4)) -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm1 -; AVX512-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm1)) -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512-NEXT: vprold $16, %xmm22, %xmm2 +; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm20[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm0 & (zmm10 ^ zmm3)) +; AVX512-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm30 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm0)) +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb %ymm6, %ymm8, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; 
AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[1,2,2,3,5,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,3,3,3,7,7,7,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,2,3,3] ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm4[2,1,3,2] +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512-NEXT: vprold $16, %xmm12, %xmm1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm1[2,1,3,2] -; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm1 -; AVX512-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512-NEXT: vpshuflw 
{{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,0,2,1,4,4,6,5] -; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512-NEXT: vprold $16, %ymm8, %ymm1 -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,2,2,3,5,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,3,3,3,7,7,7,7] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] -; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm2[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm4[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm5[2,1,3,2] -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512-NEXT: vprold $16, %xmm11, %xmm2 -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] -; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] -; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm3[0,2,2,3] -; AVX512-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm4 -; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512-NEXT: vpermt2d %zmm3, %zmm17, %zmm2 -; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm2[0,2,2,3] +; AVX512-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX512-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512-NEXT: vpshufhw 
{{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,7,6] +; AVX512-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm13 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm13 +; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm15, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm5 & (zmm13 ^ zmm1)) ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm1[2,1,3,3] +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm16 = xmm1[0,0,1,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm4[0,0,1,1] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm1)) -; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512-NEXT: # ymm1 = mem[2,1,3,2] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm31 & (ymm1 ^ ymm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm30 & (ymm20 ^ ymm1)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload -; AVX512-NEXT: # zmm1 = (zmm1 & zmm30) | mem -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = (zmm4 & zmm30) | mem +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ zmm0)) +; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = 
[65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm31 & (ymm0 ^ ymm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm25 & (ymm21 ^ ymm0)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (zmm0 & zmm25) | mem ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm20 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm25, %zmm4 +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (zmm4 & zmm20) | mem ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm30 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm2 ^ (zmm22 & (zmm30 ^ zmm2)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm1 ^ (zmm2 & (zmm30 ^ zmm1)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm25 # 32-byte Folded Reload ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm22 & (zmm5 ^ zmm1)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm2 & (zmm5 ^ zmm4)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1)) -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm18 & (ymm0 ^ m32bcst)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vpternlogq $248, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 # 64-byte Folded Reload -; AVX512-NEXT: # zmm18 = zmm18 | (zmm1 & mem) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm20 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm25 ^ (zmm14 & (zmm20 ^ zmm25)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm0 ^ (zmm3 & (zmm20 ^ zmm0)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm25 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm0 ^ (zmm14 & (zmm25 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm4 ^ (zmm3 & (zmm25 ^ zmm4)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm29 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm0 ^ (mem & (zmm29 ^ zmm0)) +; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm17 & (ymm0 ^ m32bcst)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm2 & (ymm23 ^ ymm0)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 64-byte Folded Reload +; AVX512-NEXT: # zmm24 = zmm24 | (zmm2 & mem) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm0[0,1,2,3],mem[0,1,2,3] -; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = (zmm3 & zmm1) | mem -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm20[0,1,2,3],zmm21[0,1,2,3] +; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm4 +; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512-NEXT: # zmm4 = (zmm4 & zmm0) | mem +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm21[0,1,2,3],zmm23[0,1,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-NEXT: vpandq %zmm14, %zmm0, %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm2 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-NEXT: vpermd 
%zmm2, %zmm21, %zmm20 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm1)) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] +; AVX512-NEXT: vpermd %zmm2, %zmm3, %zmm23 +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm14, %zmm21 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = (zmm23 & ~zmm21) | zmm0 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512-NEXT: vpermd %zmm0, %zmm21, %zmm21 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm4 & (zmm21 ^ zmm1)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm1 & (zmm23 ^ zmm14)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm7)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm1 ^ (zmm14 & (zmm7 ^ zmm1)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512-NEXT: vpermd %zmm1, %zmm3, %zmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ zmm0)) +; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm0 +; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm21 +; AVX512-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq $186, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = (zmm0 & ~zmm5) | mem +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm5 & (zmm17 ^ zmm18)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm5 & (zmm30 ^ zmm10)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm5 ^ (zmm10 & (zmm21 ^ zmm5)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm18, %zmm22 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm5 ^ (zmm10 & (zmm22 ^ zmm5)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-NEXT: vpermd %zmm2, %zmm10, %zmm10 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm6 & (zmm10 ^ zmm5)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm2, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm5 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,0,0,5,5,0,4,5,0,0,6,6,0,6] +; AVX512-NEXT: vpermd %zmm1, %zmm18, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm2)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm1 & (zmm10 ^ zmm21)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm1 & (zmm18 ^ zmm22)) ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm28, %zmm25 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm1 ^ (zmm14 & (zmm25 ^ zmm1)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 32-byte Folded Reload -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-NEXT: vpermd %zmm2, %zmm28, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm14)) -; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm29, %zmm2 -; AVX512-NEXT: vpermd %zmm0, %zmm28, %zmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm2)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm25)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm22 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm9 & (zmm22 ^ zmm1)) +; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512-NEXT: vinserti64x4 $1, 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512-NEXT: # ymm0 = mem[0,1,1,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (zmm9 & (zmm7 ^ zmm0)) -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm12 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm9 & (zmm12 ^ zmm0)) -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm16[0,1,1,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm9 & (zmm5 ^ zmm1)) +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512-NEXT: # ymm8 = mem[2,2,2,3] -; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512-NEXT: # xmm10 = mem[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,2,2] +; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512-NEXT: # xmm11 = mem[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 ; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512-NEXT: # ymm11 = mem[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512-NEXT: # ymm25 = mem[0,0,2,1] -; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX512-NEXT: # xmm4 = mem[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm4[0,0,1,3] -; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = 
ymm15[0,0,2,1] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512-NEXT: # ymm26 = mem[2,2,2,3] -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] -; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512-NEXT: # ymm28 = mem[2,3,3,3,6,7,7,7] -; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512-NEXT: # ymm29 = mem[0,0,2,1] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512-NEXT: # ymm12 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX512-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512-NEXT: # ymm16 = mem[0,0,1,1] +; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm8 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] +; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512-NEXT: # ymm21 = mem[2,3,3,3,6,7,7,7] +; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512-NEXT: # ymm26 = mem[0,0,2,1] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm11 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm8)) -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm4 -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm28[2,1,3,2] -; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm31 & (zmm8 ^ zmm4)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm12)) -; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm22 & (zmm7 ^ zmm4)) -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm29, 
%zmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm22 & (zmm4 ^ zmm0)) -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm7)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm16)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm2)) +; AVX512-NEXT: vpandq %zmm31, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm31, %zmm27 +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm28 +; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm11, %zmm11 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & ~zmm27) | zmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm8[0,0,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm21[2,1,3,2] +; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm19 +; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm8, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm27 & (zmm8 ^ zmm7)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm7 & (zmm11 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm7 & (zmm8 ^ zmm5)) +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm26, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm7 & (zmm2 ^ zmm1)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm5)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm22)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512-NEXT: vmovdqa64 %zmm19, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm10, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 384(%rax) -; 
AVX512-NEXT: vmovdqa64 %zmm18, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm30, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512-NEXT: vmovdqa64 %zmm23, 832(%rax) +; AVX512-NEXT: addq $2808, %rsp # imm = 0xAF8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX512-FCP-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm1 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5 +; AVX512-FCP-NEXT: vporq %ymm0, %ymm5, %ymm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm4 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm4, %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-FCP-NEXT: vporq %ymm5, %ymm7, %ymm17 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm10 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm0 +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512-FCP-NEXT: vpor %ymm10, %ymm12, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm15 +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm14 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512-FCP-NEXT: vpor %ymm15, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 
-; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm11 -; AVX512-FCP-NEXT: vporq %ymm11, %ymm6, %ymm21 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm5 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13) -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm6 -; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512-FCP-NEXT: vprold $16, %ymm6, %ymm11 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm16 & (ymm11 ^ ymm12)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm11[0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX512-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11 -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [151522058,0,421010202,421010202] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 -; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm23 & (zmm1 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] -; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm1 -; AVX512-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm2 -; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] -; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm23 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 
%zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm8 +; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm9 +; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512-FCP-NEXT: vporq %ymm9, %ymm7, %ymm18 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [151522058,0,421010202,421010202] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm25 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,2,2,3,10,9,11,11] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm14 +; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [218894094,0,488382238,488382238] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7,8],ymm7[9],ymm15[10,11],ymm7[12],ymm15[13,14,15] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm19 & (zmm0 ^ zmm14)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512-FCP-NEXT: 
vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,1,1,1,5,5,5,5] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm16 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,0,0,0,6,0,0,6] +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,0,0,0,5,0,0] +; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u,u,u],zero,zero +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & ymm3) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17) +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm1[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,4,5,4,5,5,7,4,5,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpandn %ymm1, %ymm7, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm21 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512-FCP-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512-FCP-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm1 +; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,0,3,10,10,11,11] +; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm19 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512-FCP-NEXT: vprold $16, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,2,10,10,10,11] ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,0,0,0,14,0,0,14,0,0,11,15,0,14,15] +; AVX512-FCP-NEXT: vpermd %zmm24, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm28, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm2 +; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm15 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm18 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = 
ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm1 & (zmm4 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 -; AVX512-FCP-NEXT: vprold $16, %ymm25, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm12 & (zmm0 ^ zmm3)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (mem & (zmm3 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512-FCP-NEXT: vprold $16, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm30 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm22 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8,9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] +; AVX512-FCP-NEXT: vpermt2q 
%zmm1, %zmm21, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vprold $16, %xmm6, %xmm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [8,9,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vprold $16, %xmm8, %xmm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; 
AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,0,9] +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm4 & (zmm5 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm4 & (zmm1 ^ zmm2)) ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm2 -; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 -; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm30 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm20 & (zmm30 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm5)) -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX512-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,1,8,9,9,0] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512-FCP-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm7 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] +; AVX512-FCP-NEXT: 
vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm6 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm4 & (zmm8 ^ zmm5)) +; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm9 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm4 & (zmm10 ^ zmm7)) ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 -; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5 -; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm29 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm20 & (zmm29 ^ zmm4)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm2 & (zmm29 ^ zmm8)) -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[2,2,2,2,6,6,6,6] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm12 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm7 +; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm7 & (zmm29 ^ zmm4)) +; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm31 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm17 & (zmm31 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm2 & (zmm31 ^ zmm1)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm2 & (zmm29 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[2,2,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} 
ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[0,1,1,3,4,5,5,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,2,2,3,8,9,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8,9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (zmm1 & (zmm2 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,0,2,1,4,4,6,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm8 ^ (mem & (zmm2 ^ zmm8)) +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512-FCP-NEXT: vprold $16, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[0,0,2,1,4,4,6,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7,8,9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512-FCP-NEXT: vpmovsxbd 
{{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,4,5,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpermd %zmm24, %zmm0, %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm0 & (zmm17 ^ zmm1)) ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm13 ^ (zmm23 & (zmm8 ^ zmm13)) -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm25[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm16 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm4)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm2)) -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm7 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,1,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm19 & (zmm4 ^ zmm3)) +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512-FCP-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm2)) +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-FCP-NEXT: vpshufb 
%xmm14, %xmm0, %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm23[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm27 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm14 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm23 & (zmm14 ^ zmm0)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [6,7,3,3,7,7,6,7] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm0 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm6)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm26 & (zmm19 ^ zmm8)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 +; AVX512-FCP-NEXT: vpermt2q 
%zmm2, %zmm8, %zmm12 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9] +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm22 +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm24, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm14 ^ (zmm19 & (zmm12 ^ zmm14)) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [6,7,3,3,7,7,6,7] +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm8 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm14 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm23 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm24, %zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = (zmm23 & ~zmm14) | zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm27 & (zmm23 ^ zmm4)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512-FCP-NEXT: vprold $16, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,1,8,8,9,0] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm20 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,8,9,0] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm19 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3],xmm1[4],xmm13[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 +; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm28 & (zmm12 ^ zmm0)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm22 = ymm22 ^ (ymm23 & (ymm22 ^ ymm2)) -; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm15 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,9,9,0,0,0,1,1] -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm28 & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm24 & (ymm21 ^ ymm0)) +; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm7 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; 
AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,0,1,1] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm31 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,0,1,1] -; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm1)) -; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm14)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm0 & (zmm15 ^ zmm12)) -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm10 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm26 = ymm10[0,0,1,1] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm10 +; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm30 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandq %zmm4, %zmm7, %zmm15 +; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm7 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm18 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm14 & (zmm7 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm27 & (zmm7 ^ zmm12)) +; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & ~zmm2) | zmm15 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm14 & (zmm12 ^ zmm1)) +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm1[1],xmm15[2,3],xmm1[4],xmm15[5,6],xmm1[7] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm1 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm28 & (zmm1 ^ zmm9)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 -; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm5 +; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm2 = m32bcst ^ (ymm0 & (ymm2 ^ m32bcst)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm0 & (ymm3 ^ ymm2)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm6 & (ymm4 ^ ymm22)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm3[0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm2 & (zmm4 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm14 & (zmm4 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm1 & (ymm0 ^ m32bcst)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm5 & (ymm6 ^ ymm21)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm3[0,1,2,3] +; AVX512-FCP-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = (zmm1 & zmm6) | mem -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = (zmm3 & zmm6) | mem -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX512-FCP-NEXT: vpternlogq $220, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = (zmm2 & ~mem) | zmm0 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm18) | mem +; AVX512-FCP-NEXT: 
vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = (zmm5 & zmm3) | mem +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX512-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -13788,389 +13982,391 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] -; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512-FCP-NEXT: vpandn %ymm13, %ymm14, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] 
+; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm14 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,4,5,4,5,5,7,4,5,2,2,6,6,6,6] +; AVX512-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[16,17],zero,zero +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512-FCP-NEXT: vpandnq %ymm15, %ymm16, %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm15)) +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm28 & (zmm6 ^ zmm4)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm28 & (zmm6 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm28 & (zmm8 ^ zmm4)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm1 ^ (zmm4 & (zmm6 ^ zmm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm4 & (zmm8 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm3 & (zmm6 ^ zmm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm3 & (zmm8 ^ zmm5)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm1 | (zmm0 & mem) -; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm12 = zmm12 | (zmm0 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (mem & (zmm3 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5 | (zmm1 & mem) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpternlogq $236, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (zmm0 & mem) | zmm14 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 640(%rax) ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 768(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512-FCP-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $2648, %rsp # imm = 0xA58 -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm6 -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm15 +; AVX512DQ-NEXT: subq $2808, %rsp # imm = 0xAF8 +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm0 +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm12 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm8 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = 
[128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm3 -; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm18 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm8, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm7, %ymm4 -; AVX512DQ-NEXT: vporq %ymm2, %ymm4, %ymm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm12, %ymm4 +; AVX512DQ-NEXT: vporq %ymm3, %ymm4, %ymm18 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm5 +; AVX512DQ-NEXT: vporq %ymm3, %ymm5, %ymm19 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm29 -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm24 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm5, %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm23 -; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 -; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm9 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm30 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm6 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm5 +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm31 +; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm26 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm14 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm14, %ymm5 +; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm5 +; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512DQ-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm13 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm13, %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm9 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm4 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm4, %ymm10 ; AVX512DQ-NEXT: vpor %ymm10, %ymm9, %ymm9 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm9 -; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm27 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm9 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm27 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 
= ymm9[0,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,3] +; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm9 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm10[3,3,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm10[3,3,3,3] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm10[2,2,2,2] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm10[2,2,2,2] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4],ymm0[5],ymm10[6,7,8,9],ymm0[10],ymm10[11,12],ymm0[13],ymm10[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm9 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm18, %zmm6 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm9, %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm18, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6)) -; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (mem & (zmm7 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,2,2,3,5,6,6,7] +; 
AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,u,u,u,u],zero,zero ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm12) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11) -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm12) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm0 & ymm1) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7)) -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm1 & (ymm7 ^ ymm0)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm8[2,2,2,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm7)) +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm0[0,1,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX512DQ-NEXT: vpandn %ymm7, %ymm12, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm17 -; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-NEXT: vpbroadcastd 72(%rax), %ymm6 -; AVX512DQ-NEXT: vpandnq %ymm6, %ymm30, %ymm6 -; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm7 -; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm7 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm7, %ymm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] +; AVX512DQ-NEXT: vpandn %ymm8, %ymm15, %ymm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm18 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-NEXT: vpbroadcastd 72(%rax), %ymm7 +; AVX512DQ-NEXT: vpandnq %ymm7, %ymm25, %ymm7 +; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm8 +; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm0 ; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm18 -; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm22 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm9 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm16 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,0,1,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm8 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm11 -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = 
xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,0,2,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm9 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm12 -; AVX512DQ-NEXT: vpandnq %ymm12, %ymm30, %ymm12 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 -; AVX512DQ-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm13, %ymm13 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm25 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm31 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512DQ-NEXT: vpandnq %ymm12, %ymm25, %ymm12 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm15 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm15 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm28 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm29 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm17 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm19 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,2,1] -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm14 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] -; 
AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm19 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm16 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,2,1] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,0,1,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,2] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm12, %ymm10 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm12, %ymm0 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3] -; AVX512DQ-NEXT: vpandnq %ymm12, %ymm17, %ymm12 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7,8,9,10],ymm12[11],ymm10[12,13],ymm12[14],ymm10[15] -; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm14 -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm4, %ymm10 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpandnq %ymm12, %ymm18, %ymm12 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8,9,10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm15 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7,8,9],ymm12[10],ymm0[11,12],ymm12[13],ymm0[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm3[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vprold $16, %ymm0, %ymm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; 
AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vprold $16, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm6 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,1,1,5,5,5,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} 
ymm1 = ymm14[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm2 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm10 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm28[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm6 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm31[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vprold $16, %ymm5, %ymm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[1,2,2,3,5,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm28[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm31[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15] @@ -14182,8 +14378,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7] -; 
AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] @@ -14191,11 +14387,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm2 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15] @@ -14206,8 +14402,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm5 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm2 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -14218,896 +14414,939 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm1 -; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm2 -; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm0 +; AVX512DQ-NEXT: 
vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpandq %zmm15, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm12 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm8, %xmm1 +; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm8 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] +; AVX512DQ-NEXT: vprold $16, %xmm9, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm1 ^ (zmm0 & (zmm18 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3] -; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm3, %zmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX512DQ-NEXT: vprold $16, %xmm8, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm2 ^ (zmm1 & (zmm14 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,7,6] -; 
AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] -; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm5, %zmm2 -; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm4 -; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm18 & (zmm23 ^ zmm2)) -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm15, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3],xmm4[4],xmm6[5,6],xmm4[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm3, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 +; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm3 +; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm16 & (zmm3 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm5 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX512DQ-NEXT: vprold $16, %xmm19, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm16[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm1 & (zmm7 ^ zmm4)) -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm1 -; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm19 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm18 & (zmm19 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm1[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] +; AVX512DQ-NEXT: vprold $16, %xmm22, %xmm2 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm20[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm0 & (zmm10 ^ zmm3)) +; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm0 +; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1 +; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm30 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm17 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm1 & (zmm30 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm8, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6] +; 
AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,0,2,1,4,4,6,5] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15] +; AVX512DQ-NEXT: vprold $16, %ymm6, %ymm0 +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[1,2,2,3,5,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,3,3,3,7,7,7,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] +; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,2,3,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm4[2,1,3,2] +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm11 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7] ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = 
ymm1[2,1,3,2] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm1 -; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm10 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,0,2,1,4,4,6,5] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX512DQ-NEXT: vprold $16, %ymm8, %ymm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[1,2,2,3,5,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6,7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5],ymm6[6],ymm1[7,8,9,10],ymm6[11],ymm1[12,13],ymm6[14],ymm1[15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15] -; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm2[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm4[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm5[2,1,3,2] -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm11 -; AVX512DQ-NEXT: vprold $16, %xmm11, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] -; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm3[0,2,2,3] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5,7,6] -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24 -; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm17, %zmm2 -; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm2[0,2,2,3] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1 
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm4 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,5,7,6] +; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm26 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 +; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm13 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm13 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm15, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm5 & (zmm13 ^ zmm1)) ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm8 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm1[2,1,3,3] +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm7 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm16 = xmm1[0,0,1,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm0[2,1,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm4[0,0,1,1] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm1 = mem[2,1,3,2] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm31 & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm20 = ymm20 ^ (ymm30 & (ymm20 ^ ymm1)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm1 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm1 = (zmm1 & zmm30) | mem -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm30, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = (zmm4 & 
zmm30) | mem +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm0 = mem[2,1,3,2] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm31 & (ymm0 ^ ymm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm25 & (ymm21 ^ ymm0)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm25) | mem ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm20 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm25, %zmm4 +; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = (zmm4 & zmm20) | mem ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm30 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm2 ^ (zmm22 & (zmm30 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm1 ^ (zmm2 & (zmm30 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm25 # 32-byte Folded Reload ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm22 & (zmm5 ^ zmm1)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm2 & (zmm5 ^ zmm4)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm16 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm1 ^ (mem & (zmm16 ^ zmm1)) -; 
AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm18 & (ymm0 ^ m32bcst)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm1 & (ymm21 ^ ymm0)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm18 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm18 = zmm18 | (zmm1 & mem) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm20 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm25 ^ (zmm14 & (zmm20 ^ zmm25)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm0 ^ (zmm3 & (zmm20 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm25 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm0 ^ (zmm14 & (zmm25 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm4 ^ (zmm3 & (zmm25 ^ zmm4)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm29 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm0 ^ (mem & (zmm29 ^ zmm0)) +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0 +; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm17 & (ymm0 ^ m32bcst)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm2 & (ymm23 ^ ymm0)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm24 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm24 = zmm24 | (zmm2 & mem) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm0[0,1,2,3],mem[0,1,2,3] -; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = (zmm3 & zmm1) | mem -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm20[0,1,2,3],zmm21[0,1,2,3] +; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm4 +; AVX512DQ-NEXT: 
vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm4 = (zmm4 & zmm0) | mem +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm21[0,1,2,3],zmm23[0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpandq %zmm14, %zmm0, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm2 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm21, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm4 & (zmm20 ^ zmm1)) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm3, %zmm23 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm14, %zmm21 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = (zmm23 & ~zmm21) | zmm0 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm21, %zmm21 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm4 & (zmm21 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm1 & (zmm23 ^ zmm14)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm7)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm1 ^ (zmm14 & (zmm7 ^ zmm1)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm1 +; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ zmm0)) +; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm0 +; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm21 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq $186, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm0 = (zmm0 & ~zmm5) | mem +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm5 & (zmm17 ^ zmm18)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm5 & (zmm30 ^ zmm10)) +; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm21 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm5 ^ (zmm10 & (zmm21 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm18, %zmm22 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm5 ^ (zmm10 & (zmm22 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-NEXT: vpermd %zmm2, %zmm10, %zmm10 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm6 & (zmm10 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm2, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm6, %zmm5 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,0,0,5,5,0,4,5,0,0,6,6,0,6] +; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm1 & (zmm10 ^ zmm21)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm1 & (zmm18 ^ zmm22)) ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm28, %zmm25 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm1 ^ (zmm14 & (zmm25 ^ zmm1)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm28, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm14)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm29, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm28, %zmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm2)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm25)) -; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm22 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm9 & (zmm22 ^ zmm1)) +; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm7 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (zmm9 & (zmm7 ^ zmm0)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[0,1,1,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm9 & (zmm12 ^ zmm0)) -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm9 & (zmm2 ^ zmm1)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm16[0,1,1,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm19, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm9 & (zmm5 ^ zmm1)) +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3],xmm9[4],xmm10[5,6],xmm9[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm8 = mem[2,2,2,3] -; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm10 = mem[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,2,2] +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm7 = mem[2,2,2,3] +; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm11 = mem[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] 
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm7 ; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm11 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm25 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm4 = mem[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm4[0,0,1,3] -; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm17 = mem[0,0,1,1] -; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm26 = mem[2,2,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,1,3] -; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm28 = mem[2,3,3,3,6,7,7,7] -; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm29 = mem[0,0,2,1] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm12 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm15 = mem[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,3] +; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm16 = mem[0,0,1,1] +; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1] +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm19 = mem[2,2,2,3] +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,1,3] +; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm21 = mem[2,3,3,3,6,7,7,7] +; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm26 = mem[0,0,2,1] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm11[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm11 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 -; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm8)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm26, %zmm4 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm28[2,1,3,2] -; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm11 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm31 & (zmm8 ^ zmm4)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm12)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm25, %zmm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm17, %zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm4 ^ (zmm22 & (zmm7 ^ zmm4)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm29, %zmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm22 & (zmm4 ^ zmm0)) -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm7)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm0 & (zmm13 ^ zmm4)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm16)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm2)) +; AVX512DQ-NEXT: vpandq %zmm31, %zmm7, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm27, %zmm31, %zmm27 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] +; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm28 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm11, %zmm11 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & ~zmm27) | zmm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm8[0,0,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm19, %zmm7 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm21[2,1,3,2] +; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm19 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm27 & (zmm8 ^ zmm7)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm7 & (zmm11 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm7 & (zmm8 ^ zmm5)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm12, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm7 & (zmm5 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm26, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm7 & (zmm2 ^ zmm1)) +; AVX512DQ-NEXT: vmovdqa64 
{{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm5)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm29)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm22)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm10, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm30, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm18, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm20, 832(%rax) -; AVX512DQ-NEXT: addq $2648, %rsp # imm = 0xA58 +; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm30, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm17, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 704(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 640(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 576(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, 512(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm24, 768(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm23, 832(%rax) +; AVX512DQ-NEXT: addq $2808, %rsp # imm = 0xAF8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8 +; AVX512DQ-FCP-NEXT: subq $1560, %rsp # imm = 0x618 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm5, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm4 -; 
AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vporq %ymm5, %ymm7, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm31 +; AVX512DQ-FCP-NEXT: vpor %ymm10, %ymm12, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm15 +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm14 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm15 -; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm13 -; AVX512DQ-FCP-NEXT: vpor %ymm15, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm11 -; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm6, %ymm21 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm5[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,u,u,u,u],zero,zero -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13) -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm11 & ymm16) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm20 -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm6, %ymm11 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm16 & (ymm11 ^ ymm12)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm11[0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm7[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7,8,9,10],ymm4[11],ymm11[12,13],ymm4[14],ymm11[15] -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [151522058,0,421010202,421010202] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm24 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2,3],ymm2[4],ymm7[5,6,7,8],ymm2[9],ymm7[10,11],ymm2[12],ymm7[13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,2,2,3,10,9,11,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [218894094,0,488382238,488382238] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm4 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13,14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm23 & (zmm1 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpandn %ymm1, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,0,3,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm21, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm14[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm23 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm12 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm11 +; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm7, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [151522058,0,421010202,421010202] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm25 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,2,2,3,10,9,11,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm14 +; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [218894094,0,488382238,488382238] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = 
ymm2[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7,8],ymm7[9],ymm15[10,11],ymm7[12],ymm15[13,14,15] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27] +; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7,8,9],ymm0[10],ymm15[11,12],ymm0[13],ymm15[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm30, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm19 & (zmm0 ^ zmm14)) ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] +; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm25[1,1,1,1,5,5,5,5] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm16 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [5,0,0,0,6,0,0,6] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,0,0,0,5,0,0] +; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u,u,u],zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & ymm3) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm17) +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm2 & (ymm0 ^ ymm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (ymm2 & (ymm1 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm1[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,4,5,4,5,5,7,4,5,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpandn %ymm1, %ymm7, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm16 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vpandn %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
%ymm1, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm15 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm21 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,0,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm28[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm9 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6,7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm10[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm19 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,2,10,10,10,11] ; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm10 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,0,0,0,14,0,0,14,0,0,11,15,0,14,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm24, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm28, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm18 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm1 & (zmm4 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm13 -; AVX512DQ-FCP-NEXT: vprold $16, %ymm25, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm25 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [2,2,3,3,10,9,11,10] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm27, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm0 -; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm12 & (zmm0 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm29[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm2 ^ (mem & (zmm3 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vprold $16, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm30 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm22 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7,8,9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,3,3,10,9,11,10] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vprold $16, %xmm6, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,1,1,8,8,0,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm7 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm12, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vprold $16, %xmm8, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,0,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm28, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm4 & (zmm5 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (zmm4 & (zmm1 ^ zmm2)) ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0] -; 
AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm29, %zmm2 -; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6 -; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm30 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm20 & (zmm30 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [0,0,0,1,8,9,9,0] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, (%rsp) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6],xmm7[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm24 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm4 & (zmm8 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm9 +; AVX512DQ-FCP-NEXT: 
vpshufd {{.*#+}} xmm10 = xmm11[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm28, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm4 & (zmm10 ^ zmm7)) ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm5 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm4 -; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5 -; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm29 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm20 & (zmm29 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm2 & (zmm29 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm20[2,2,2,2,6,6,6,6] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm2[2],ymm6[3,4],ymm2[5],ymm6[6,7,8,9],ymm2[10],ymm6[11,12],ymm2[13],ymm6[14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm11, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, 
%xmm4, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm7 +; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm7 & (zmm29 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm31 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm17 & (zmm31 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm2 & (zmm31 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm2 & (zmm29 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[2,2,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm12[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[0,1,1,3,4,5,5,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7,8,9],ymm2[10],ymm4[11,12],ymm2[13],ymm4[14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,2,2,3,8,9,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm5 -; 
AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8,9,10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm4 ^ (zmm1 & (zmm2 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7,8,9],ymm1[10],ymm4[11,12],ymm1[13],ymm4[14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[0,0,2,1,4,4,6,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm4 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm8 ^ (mem & (zmm2 ^ zmm8)) +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vprold $16, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm20 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm18[0,0,2,1,4,4,6,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm8[3],ymm1[4,5],ymm8[6],ymm1[7,8,9,10],ymm8[11],ymm1[12,13],ymm8[14],ymm1[15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,4,5,4,5,5,7,4,5,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpermd %zmm24, %zmm0, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm0 & (zmm17 ^ zmm1)) ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm19, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm13 ^ (zmm23 & (zmm8 ^ zmm13)) -; 
AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm22 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm25[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0] -; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm16 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm7 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm8 = [2,1,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm19 & (zmm4 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm21 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31] +; AVX512DQ-FCP-NEXT: # ymm21 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm20[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; 
AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm23[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm27 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm14 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,2,2,3,8,8,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm0 ^ (zmm23 & (zmm14 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [6,7,3,3,7,7,6,7] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm0 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm19 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm23 & (zmm19 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm26 & (zmm19 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm26 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm12 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,2,3,8,8,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq 
%zmm24, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm14 ^ (zmm19 & (zmm12 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [6,7,3,3,7,7,6,7] +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm8 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm14 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm23 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm24, %zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = (zmm23 & ~zmm14) | zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm27 & (zmm23 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,3] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7,8,9],ymm5[10],ymm1[11,12],ymm5[13],ymm1[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm15 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,0,1,8,8,9,0] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm21, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm20 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm25 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,0,0,1,8,8,9,0] 
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm19 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3],xmm1[4],xmm13[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm0 ^ (zmm28 & (zmm12 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm22 = ymm22 ^ (ymm23 & (ymm22 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [8,9,9,0,0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm28 & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm21 = ymm21 ^ (ymm24 & (ymm21 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [8,9,9,0,0,0,1,1] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm24, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,0,1,1] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm31 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm7[0,0,1,1] -; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm18, %ymm7 
# 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm23 & (zmm7 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm0 & (zmm15 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm26 = ymm10[0,0,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm30 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandq %zmm4, %zmm7, %zmm15 +; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm25, %ymm7 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm18 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm14 & (zmm7 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm27 & (zmm7 ^ zmm12)) +; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & ~zmm2) | zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm14 & (zmm12 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] ; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: 
vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm1[1],xmm15[2,3],xmm1[4],xmm15[5,6],xmm1[7] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm22, %zmm1 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm28 & (zmm1 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm27, %zmm6 -; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm5 +; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm18 & (zmm5 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm2 = m32bcst ^ (ymm0 & (ymm2 ^ m32bcst)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ 
(ymm0 & (ymm3 ^ ymm2)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm6 & (ymm4 ^ ymm22)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm4[0,1,2,3],zmm3[0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm2 & (zmm4 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm14 & (zmm4 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = m32bcst ^ (ymm1 & (ymm0 ^ m32bcst)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm5 & (ymm6 ^ ymm21)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm3[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [12,13,10,10,14,14,14,14,14,15,11,11,15,15,14,15] ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = (zmm1 & zmm6) | mem -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = (zmm3 & zmm6) | mem -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7] +; AVX512DQ-FCP-NEXT: vpternlogq $220, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = (zmm2 & ~mem) | zmm0 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm18) | mem +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm3 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = (zmm5 & zmm3) | mem +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; 
AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX512DQ-FCP-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11] ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload @@ -15115,64 +15354,65 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15] ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23] -; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm12 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,4,5,4,5,5,7] -; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[16,17],zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] -; AVX512DQ-FCP-NEXT: vpandn %ymm13, %ymm14, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] +; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm14 = mem[1,1,1,1,5,5,5,5] +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7,8,9],ymm14[10],ymm11[11,12],ymm14[13],ymm11[14,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm14 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = 
[0,1,4,5,4,5,5,7,4,5,2,2,6,6,6,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[16,17],zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535] +; AVX512DQ-FCP-NEXT: vpandnq %ymm15, %ymm16, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm15)) +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm28 & (zmm6 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm28 & (zmm6 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm3 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm8 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm4 ^ (zmm28 & (zmm8 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm1 ^ (zmm4 & (zmm6 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm4 & (zmm8 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm3 & (zmm6 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm3 & (zmm8 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 | (zmm0 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm12 = zmm12 | (zmm0 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (mem & (zmm3 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5 | (zmm1 & mem) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq $236, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & mem) | zmm14 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 640(%rax) ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 768(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax) -; AVX512DQ-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8 +; AVX512DQ-FCP-NEXT: addq $1560, %rsp # imm = 0x618 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index d1fd4a360036b..14cdfb38212c0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -51,22 +51,22 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vmovaps (%rsi), %xmm1 ; AVX-NEXT: vmovaps (%rdx), %xmm2 -; AVX-NEXT: vmovaps (%r9), %xmm3 -; AVX-NEXT: vmovaps (%r10), %xmm4 +; AVX-NEXT: vmovaps (%r8), %xmm3 +; AVX-NEXT: vmovaps (%r9), %xmm4 +; AVX-NEXT: vmovaps (%r10), %xmm5 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm6 +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm7 +; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3] ; AVX-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 -; 
AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 -; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX-NEXT: vmovaps (%r8), %xmm6 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7 -; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[2],ymm0[3] -; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm4[1] -; AVX-NEXT: vmovaps %xmm2, 96(%rax) -; AVX-NEXT: vmovaps %ymm1, 64(%rax) -; AVX-NEXT: vmovapd %ymm0, 32(%rax) -; AVX-NEXT: vmovaps %ymm5, (%rax) +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm5[1] +; AVX-NEXT: vmovaps %xmm1, 96(%rax) +; AVX-NEXT: vmovaps %ymm0, (%rax) +; AVX-NEXT: vmovaps %ymm3, 64(%rax) +; AVX-NEXT: vmovapd %ymm6, 32(%rax) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 674bad2c7aa87..faa89f3c9495e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -1040,38 +1040,38 @@ define void @store_i8_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX-NEXT: vmovdqa (%rsi), %xmm2 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX-NEXT: vmovdqa (%rdx), %xmm6 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX-NEXT: vmovdqa (%rdx), %xmm5 +; AVX-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = 
xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, 64(%rcx) ; AVX-NEXT: vmovdqa %xmm4, 80(%rcx) ; AVX-NEXT: vmovdqa %xmm3, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm5, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm5, (%rcx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) ; AVX-NEXT: retq ; @@ -1631,122 +1631,112 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride3_vf64: ; AVX: # %bb.0: -; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: vmovdqa (%rdi), %xmm8 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 ; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rdx), %xmm5 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 -; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm0 +; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm3 +; AVX-NEXT: vmovdqa (%rdx), %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 32(%rdx), %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] +; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm4 +; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm6 +; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm0, %xmm3 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm4 -; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm7 +; AVX-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10] -; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] +; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] ; AVX-NEXT: vmovdqa 16(%rsi), %xmm13 -; AVX-NEXT: vmovdqa 32(%rsi), %xmm14 -; AVX-NEXT: vmovdqa 48(%rsi), %xmm15 -; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm12 -; AVX-NEXT: vpor %xmm3, %xmm12, %xmm0 +; AVX-NEXT: vmovdqa 32(%rsi), %xmm11 +; AVX-NEXT: 
vmovdqa 48(%rsi), %xmm14 +; AVX-NEXT: vpshufb %xmm12, %xmm14, %xmm15 +; AVX-NEXT: vpor %xmm2, %xmm15, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm6 -; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm12 -; AVX-NEXT: vpor %xmm6, %xmm12, %xmm0 +; AVX-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm15 +; AVX-NEXT: vpor %xmm5, %xmm15, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm12 -; AVX-NEXT: vpor %xmm9, %xmm12, %xmm0 +; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm15 +; AVX-NEXT: vpor %xmm9, %xmm15, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa (%rsi), %xmm2 +; AVX-NEXT: vmovdqa (%rsi), %xmm15 ; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm10 -; AVX-NEXT: vpor %xmm8, %xmm10, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm5, %xmm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] -; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm8 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] -; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm6 -; AVX-NEXT: vmovdqa 32(%rdx), %xmm12 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm5 +; AVX-NEXT: vpshufb %xmm12, %xmm15, %xmm10 +; AVX-NEXT: vpor %xmm8, %xmm10, %xmm5 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] +; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] +; AVX-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX-NEXT: vmovdqa 16(%rdx), %xmm0 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] +; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm2 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] +; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm3 ; AVX-NEXT: vmovdqa 48(%rdx), %xmm10 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4] -; AVX-NEXT: vpor %xmm7, %xmm11, %xmm11 -; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4] -; AVX-NEXT: vpor 
%xmm4, %xmm11, %xmm11 -; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm1, %xmm11 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm14[8],xmm10[9],xmm14[9],xmm10[10],xmm14[10],xmm10[11],xmm14[11],xmm10[12],xmm14[12],xmm10[13],xmm14[13],xmm10[14],xmm14[14],xmm10[15],xmm14[15] +; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm9 +; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4] +; AVX-NEXT: vpor %xmm7, %xmm12, %xmm12 +; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX-NEXT: vpor %xmm4, %xmm12, %xmm12 +; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm11 +; AVX-NEXT: vpor %xmm5, %xmm11, %xmm5 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpor %xmm1, %xmm14, %xmm14 -; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm0, %xmm3 -; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: vpor %xmm0, %xmm14, %xmm14 -; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm15 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm2 -; AVX-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX-NEXT: vpshufb %xmm9, %xmm11, 
%xmm4 -; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm12 -; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8 -; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm9 -; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm0 -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm5 +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm11 +; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0 +; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX-NEXT: vpshufb %xmm9, %xmm15, %xmm3 +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm11 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm11 -; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm1 -; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm12 +; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm13 +; AVX-NEXT: vpor %xmm13, %xmm11, %xmm11 +; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX-NEXT: vpshufb %xmm8, %xmm10, %xmm8 +; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6 +; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3 +; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm10 +; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm2 +; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm9 +; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm1 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7 -; AVX-NEXT: vmovdqa %xmm3, 80(%rcx) -; AVX-NEXT: vmovdqa %xmm4, 64(%rcx) -; AVX-NEXT: vmovdqa %xmm0, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm5, 32(%rcx) +; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm12 +; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm7 +; AVX-NEXT: vmovdqa %xmm2, 80(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 64(%rcx) +; AVX-NEXT: vmovdqa %xmm5, 16(%rcx) ; AVX-NEXT: vmovdqa %xmm10, 48(%rcx) ; AVX-NEXT: vmovdqa %xmm7, 176(%rcx) -; AVX-NEXT: vmovdqa %xmm8, 160(%rcx) -; AVX-NEXT: vmovdqa %xmm11, 96(%rcx) -; AVX-NEXT: vmovdqa %xmm6, 112(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 128(%rcx) +; AVX-NEXT: vmovdqa %xmm6, 160(%rcx) +; AVX-NEXT: vmovdqa %xmm11, 112(%rcx) ; AVX-NEXT: vmovdqa %xmm12, 144(%rcx) -; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: vmovdqa %xmm4, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm9, 96(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 128(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: store_i8_stride3_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll index 21b98dbb3843e..9db7309411061 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-4.ll @@ -877,38 +877,38 @@ define void @store_i8_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride4_vf32: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; AVX-NEXT: vmovdqa (%rcx), %xmm2 -; AVX-NEXT: vmovdqa 16(%rcx), %xmm3 -; AVX-NEXT: vmovdqa (%rdx), %xmm6 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm7 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vmovdqa (%rsi), %ymm1 +; AVX-NEXT: vmovdqa (%rdx), %ymm2 +; AVX-NEXT: vmovdqa (%rcx), %ymm3 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX-NEXT: vextractf128 $1, %ymm3, %xmm6 +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm8 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX-NEXT: 
vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm2 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 -; AVX-NEXT: vmovaps %ymm2, (%r8) +; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm3 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 ; AVX-NEXT: vmovaps %ymm1, 96(%r8) ; AVX-NEXT: vmovaps %ymm3, 64(%r8) ; AVX-NEXT: vmovaps %ymm0, 32(%r8) +; AVX-NEXT: vmovaps %ymm2, (%r8) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -1291,74 +1291,74 @@ define void @store_i8_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX-LABEL: store_i8_stride4_vf64: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rsi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 -; AVX-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX-NEXT: vmovdqa 48(%rsi), %xmm4 -; AVX-NEXT: vmovdqa (%rdi), %xmm6 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm10 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; AVX-NEXT: vmovdqa (%rdi), %ymm0 +; AVX-NEXT: vmovdqa (%rsi), %ymm2 +; AVX-NEXT: vmovdqa (%rdx), %ymm7 +; AVX-NEXT: vmovdqa (%rcx), %ymm8 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX-NEXT: vmovdqa 48(%rsi), %xmm11 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm12 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm2[8],xmm9[9],xmm2[9],xmm9[10],xmm2[10],xmm9[11],xmm2[11],xmm9[12],xmm2[12],xmm9[13],xmm2[13],xmm9[14],xmm2[14],xmm9[15],xmm2[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; AVX-NEXT: vmovdqa (%rcx), %xmm6 -; AVX-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX-NEXT: vmovdqa 48(%rcx), %xmm12 -; AVX-NEXT: vmovdqa (%rdx), %xmm8 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm13 -; AVX-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3],xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm10[8],xmm14[9],xmm10[9],xmm14[10],xmm10[10],xmm14[11],xmm10[11],xmm14[12],xmm10[12],xmm14[13],xmm10[13],xmm14[14],xmm10[14],xmm14[15],xmm10[15] -; AVX-NEXT: vmovdqa 48(%rdx), %xmm10 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX-NEXT: vextractf128 $1, %ymm8, %xmm10 +; AVX-NEXT: vextractf128 $1, %ymm7, %xmm11 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX-NEXT: vmovdqa 32(%rcx), %xmm14 +; AVX-NEXT: vmovdqa 32(%rdx), %xmm15 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX-NEXT: vmovdqa 48(%rcx), %xmm10 +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX-NEXT: vmovdqa 48(%rdx), %xmm15 +; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; AVX-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3] +; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm13, %ymm6 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm14, %ymm1 -; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7 -; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 +; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8 +; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm14, %ymm2 +; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm13, %ymm6 +; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0 -; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4 -; AVX-NEXT: vmovaps %ymm5, 64(%r8) -; AVX-NEXT: vmovaps %ymm1, 96(%r8) -; AVX-NEXT: vmovaps %ymm7, 128(%r8) +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5 +; AVX-NEXT: vmovaps %ymm3, 160(%r8) +; AVX-NEXT: vmovaps %ymm5, 224(%r8) ; AVX-NEXT: vmovaps %ymm0, 192(%r8) -; AVX-NEXT: vmovaps %ymm2, 160(%r8) -; AVX-NEXT: vmovaps %ymm4, 224(%r8) -; AVX-NEXT: vmovaps %ymm3, (%r8) -; AVX-NEXT: vmovaps %ymm6, 32(%r8) +; AVX-NEXT: vmovaps %ymm6, 128(%r8) +; AVX-NEXT: vmovaps %ymm2, 96(%r8) +; AVX-NEXT: vmovaps %ymm4, 64(%r8) +; AVX-NEXT: vmovaps %ymm1, (%r8) +; AVX-NEXT: vmovaps %ymm8, 32(%r8) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index ad9db98711a62..9200cdc178f44 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -717,7 +717,9 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; 
AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm4) ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -747,7 +749,9 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm4) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -777,7 +781,9 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512DQ-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm4) ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -807,7 +813,9 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm4, %ymm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm4) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -1321,10 +1329,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 
-; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm5 & (zmm4 | zmm3) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm3 & ~zmm5) | zmm4 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] @@ -1334,7 +1344,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1357,10 +1367,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm5 & (zmm4 | zmm3) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm3 & ~zmm5) | zmm4 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] @@ -1370,7 +1382,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -1396,10 +1408,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,2,2,0] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 -; AVX512DQ-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm5 & (zmm4 | zmm3) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm3 & ~zmm5) | zmm4 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] @@ -1409,7 +1423,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -1432,10 +1446,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vporq %zmm3, %zmm4, %zmm3 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm5 & (zmm4 | zmm3) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm3 & ~zmm5) | zmm4 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[5,13,u],zero,zero,xmm2[6,14,u],zero,zero,xmm2[7,15,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] @@ -1445,7 +1461,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15] ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2396,10 +2412,13 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm7 & (ymm5 ^ ymm6)) ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm9[0,1,2,3] -; AVX512-NEXT: vmovdqa (%r8), %xmm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512-NEXT: vpandq %zmm6, %zmm5, %zmm5 +; AVX512-NEXT: vmovdqa (%r8), %xmm8 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512-NEXT: vpermd %zmm8, %zmm9, %zmm8 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = (zmm8 & ~zmm6) | zmm5 ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u] ; AVX512-NEXT: vpor %ymm5, %ymm8, %ymm5 @@ -2469,34 +2488,37 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm8[0,1,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm8[0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512-FCP-NEXT: vpandq %zmm5, %zmm4, %zmm7 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] +; AVX512-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm8 & ~zmm5) | zmm7 +; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] -; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] ; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm10 & (ymm8 ^ ymm7)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] -; AVX512-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] ; AVX512-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm9[0,1,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm9[0,1,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6] ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & mem) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] ; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 @@ -2511,7 +2533,7 @@ define void 
@store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512-FCP-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2545,10 +2567,13 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm7 & (ymm5 ^ ymm6)) ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm9[0,1,2,3] -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512DQ-NEXT: vpandq %zmm6, %zmm5, %zmm5 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm8 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] +; AVX512DQ-NEXT: vpermd %zmm8, %zmm9, %zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm6, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = (zmm8 & ~zmm6) | zmm5 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u] ; AVX512DQ-NEXT: vpor %ymm5, %ymm8, %ymm5 @@ -2618,34 +2643,37 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm8[0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpandq %zmm5, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = (zmm8 & ~zmm5) | zmm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19] ; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm10 & (ymm8 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u] -; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25] ; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5)) -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm9[0,1,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm7)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,2,3],zmm9[0,1,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6] ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm7 & mem) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30] ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 @@ -2660,7 +2688,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 128(%r9) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -4677,11 +4705,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512-NEXT: vporq %xmm0, %xmm1, %xmm21 ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512-NEXT: vpshufb %ymm10, %ymm15, %ymm0 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] -; AVX512-NEXT: vpshufb %ymm10, %ymm14, %ymm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] +; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm1 ; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm22 ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] @@ -4721,10 +4749,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb %ymm15, %ymm14, %ymm14 ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm27 ; AVX512-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX512-NEXT: vpshufb %ymm10, %ymm14, %ymm10 ; AVX512-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512-NEXT: vporq %ymm3, %ymm10, %ymm16 +; AVX512-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512-NEXT: vporq %ymm10, %ymm3, %ymm16 ; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX512-NEXT: vpshufb %ymm15, %ymm13, %ymm3 ; AVX512-NEXT: vporq %ymm0, %ymm3, %ymm17 @@ -4796,205 +4824,227 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & zmm15) ; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm24[2,2,3,3,6,6,7,7] ; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm26[2,2,3,3,6,6,7,7] -; AVX512-NEXT: vporq %zmm3, %zmm7, %zmm3 -; AVX512-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512-NEXT: vpermq {{.*#+}} zmm10 = zmm25[2,2,3,3,6,6,7,7] -; AVX512-NEXT: vpermq {{.*#+}} zmm11 = zmm27[2,2,3,3,6,6,7,7] -; AVX512-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm11 & (zmm10 ^ zmm3)) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] -; AVX512-NEXT: vpermd %zmm7, %zmm3, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm28 & (zmm3 
^ zmm10)) -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,2,3,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = ~zmm10 & (zmm7 | zmm3) +; AVX512-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512-NEXT: vpermq {{.*#+}} zmm11 = zmm25[2,2,3,3,6,6,7,7] +; AVX512-NEXT: vpermq {{.*#+}} zmm12 = zmm27[2,2,3,3,6,6,7,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 & (zmm12 | zmm11) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm28 & (zmm12 | zmm7) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512-NEXT: vpermd %zmm3, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm28, %zmm14 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm7 & ~zmm14) | zmm12 +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,3,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm19[2,2,3,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm12 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (zmm8 & (zmm12 ^ zmm10)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm7 ^ (zmm8 & (zmm12 ^ zmm7)) ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) +; AVX512-NEXT: vpandnq %zmm0, %zmm10, %zmm0 +; AVX512-NEXT: vpandq %zmm10, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm11 & (zmm1 | zmm0) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,6,7,7,7,7,24,24,24,24,24,24,25,25] +; AVX512-NEXT: vpermi2d %zmm3, %zmm3, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm0 & ~zmm7) | zmm1 +; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm5[0,0,1,1,4,4,5,5] +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm2[0,0,1,1,4,4,5,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm1)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm12 & mem) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] -; AVX512-NEXT: vpermd %zmm7, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm5[0,0,1,1,4,4,5,5] -; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-NEXT: vpermd %zmm7, %zmm1, 
%zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) ; AVX512-NEXT: vmovdqa64 %zmm13, 64(%r9) -; AVX512-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r9) +; AVX512-NEXT: vmovdqa64 %zmm7, 128(%r9) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride5_vf64: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11] -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9,u,11,u],zero,xmm1[10],zero,xmm1[12,u],zero -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512-FCP-NEXT: vporq %xmm0, %xmm2, %xmm22 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7 -; AVX512-FCP-NEXT: vporq %ymm6, %ymm7, %ymm23 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[6],zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9],zero,xmm1[11,u],zero,xmm1[10],zero,xmm1[12] -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6],zero,xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[11],zero,xmm1[u,10],zero,xmm1[12],zero -; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512-FCP-NEXT: vporq %xmm6, %xmm7, %xmm24 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm7 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512-FCP-NEXT: vporq %ymm7, %ymm4, %ymm25 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm4 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm9 -; AVX512-FCP-NEXT: vporq %ymm4, %ymm9, %ymm26 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4 -; 
AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm13 -; AVX512-FCP-NEXT: vporq %ymm4, %ymm13, %ymm27 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm7 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm29 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[8],zero,xmm14[u,7],zero,xmm14[9],zero,xmm14[u],zero,xmm14[u,10],zero,xmm14[12],zero,xmm14[u,11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm22 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm23 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] +; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm30 +; AVX512-FCP-NEXT: vporq %xmm2, %xmm3, %xmm24 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] +; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm13 -; AVX512-FCP-NEXT: vporq %ymm7, %ymm13, %ymm28 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vbroadcasti128 
{{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] -; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm15 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm15, %ymm16 -; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm25 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm4 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] +; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm5 +; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm8 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm13 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8 -; AVX512-FCP-NEXT: vporq %ymm3, %ymm8, %ymm19 -; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm18 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20 -; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm2[8,u],zero,xmm2[7],zero,xmm2[9,u,11,u],zero,xmm2[10],zero,xmm2[12,u],zero -; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm5 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512-FCP-NEXT: vporq %ymm13, %ymm15, %ymm26 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm13 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm11 +; AVX512-FCP-NEXT: vporq %ymm0, 
%ymm11, %ymm16 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm17 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm15 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm15, %ymm18 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm19 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vporq %ymm2, %ymm1, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11] +; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm6 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] -; AVX512-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX512-FCP-NEXT: vpandn %ymm9, %ymm10, %ymm9 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm9 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,0,5,5,5,5,0,6] -; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX512-FCP-NEXT: vpandn %ymm11, %ymm9, %ymm11 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512-FCP-NEXT: 
vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9] -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm14 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3],xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm10, %xmm10 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,1,2,2,2,2,2,2] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm15 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX512-FCP-NEXT: vpandnq %ymm15, %ymm27, %ymm15 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm15 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [4,0,5,5,5,5,0,6] +; AVX512-FCP-NEXT: vpermd %ymm15, %ymm28, %ymm28 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX512-FCP-NEXT: vpandnq %ymm28, %ymm15, %ymm28 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm11, %zmm11 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,8,8,9,9] +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm20 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm0 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,1,8,8,9,9] -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9] +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm22[0,0,1,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,0,1,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm4, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm5 & (zmm4 ^ zmm2)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm4 & zmm10) -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm26[2,2,3,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm28[2,2,3,3] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm5 & (zmm4 ^ zmm2)) -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,10,11,11] -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm2 & (zmm18 ^ zmm17)) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm18)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm2 & (zmm0 ^ zmm20)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem) -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%r9) +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm4 & (zmm5 ^ zmm3)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm22[0,0,1,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm24[0,0,1,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm7 & (zmm6 ^ zmm3)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm6 & zmm27) +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,2,3,3,10,10,11,11] +; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512-FCP-NEXT: vpandnq %zmm8, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpandq %zmm3, %zmm13, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 & (zmm8 | zmm6) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm10 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm6 & ~zmm10) | zmm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm17[2,2,3,3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm16, %zmm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm19[2,2,3,3] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm7 & (zmm8 ^ zmm6)) +; AVX512-FCP-NEXT: vpandnq %zmm20, %zmm3, %zmm6 +; AVX512-FCP-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm9 & (zmm0 | zmm6) +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm2 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm2 & ~zmm3) | zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm8 & mem) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%r9) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -5019,11 +5069,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29 ; AVX512DQ-NEXT: vporq %xmm0, %xmm1, %xmm21 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm15 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm15, %ymm0 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm14, %ymm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm1 ; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm22 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] @@ -5063,10 +5113,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm14, %ymm14 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm27 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm14 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm14, %ymm10 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm13 -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512DQ-NEXT: vporq %ymm3, %ymm10, %ymm16 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm3 +; AVX512DQ-NEXT: vporq %ymm10, %ymm3, %ymm16 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm3 ; AVX512DQ-NEXT: vporq %ymm0, %ymm3, %ymm17 @@ -5138,205 +5188,227 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm7 & zmm15) ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm24[2,2,3,3,6,6,7,7] ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm26[2,2,3,3,6,6,7,7] -; AVX512DQ-NEXT: vporq %zmm3, %zmm7, %zmm3 -; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm10 = zmm25[2,2,3,3,6,6,7,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm11 = zmm27[2,2,3,3,6,6,7,7] -; AVX512DQ-NEXT: vporq %zmm10, %zmm11, %zmm10 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm11 & (zmm10 ^ zmm3)) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm3, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm28 & (zmm3 ^ zmm10)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm17[2,2,3,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = ~zmm10 & (zmm7 | zmm3) +; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm11 = zmm25[2,2,3,3,6,6,7,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm12 = zmm27[2,2,3,3,6,6,7,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 & (zmm12 | zmm11) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm28 & (zmm12 | zmm7) +; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} zmm7 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm7, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm28, %zmm14 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm7 & ~zmm14) | zmm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm17[2,2,3,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm19[2,2,3,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm12 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (zmm8 & (zmm12 ^ zmm10)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm7 ^ (zmm8 & (zmm12 ^ zmm7)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm11 & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vpandnq %zmm0, %zmm10, %zmm0 +; AVX512DQ-NEXT: vpandq %zmm10, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm11 & (zmm1 | zmm0) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,6,7,7,7,7,24,24,24,24,24,24,25,25] +; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm3, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm0 & ~zmm7) | zmm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm5[0,0,1,1,4,4,5,5] +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm2[0,0,1,1,4,4,5,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-NEXT: vpermd %zmm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm1)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm12 & mem) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm5[0,0,1,1,4,4,5,5] -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[0,0,1,1,4,4,5,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%r9) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 256(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%r9) ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r9) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 128(%r9) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride5_vf64: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = 
[128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9,u,11,u],zero,xmm1[10],zero,xmm1[12,u],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm30 -; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm2, %xmm22 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7 -; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm7, %ymm23 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[6],zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9],zero,xmm1[11,u],zero,xmm1[10],zero,xmm1[12] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6],zero,xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[11],zero,xmm1[u,10],zero,xmm1[12],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm21 -; AVX512DQ-FCP-NEXT: vporq %xmm6, %xmm7, %xmm24 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4 -; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm4, %ymm25 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] -; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm9 -; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm9, %ymm26 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm13 -; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm13, %ymm27 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] -; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm7 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = 
[19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm14 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[8],zero,xmm14[u,7],zero,xmm14[9],zero,xmm14[u],zero,xmm14[u,10],zero,xmm14[12],zero,xmm14[u,11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm28 +; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm22 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm30 +; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm3, %xmm24 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] +; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm13 -; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm13, %ymm28 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128] -; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30] -; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm15 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm15, %ymm16 -; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3 -; 
AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm25 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128] +; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm4 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0] +; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm5 +; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm13 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8 -; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm8, %ymm19 -; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm3 -; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm18 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20 -; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm1 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm2[8,u],zero,xmm2[7],zero,xmm2[9,u,11,u],zero,xmm2[10],zero,xmm2[12,u],zero -; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm15 +; AVX512DQ-FCP-NEXT: vporq %ymm13, %ymm15, %ymm26 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128] +; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm11 +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm11, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm15 +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm15, %ymm18 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm19 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,1,2,2,2,2,2,2] -; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm10, %ymm9 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm9 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [4,0,5,5,5,5,0,6] -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] -; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm9, %ymm11 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm12 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm11 -; 
AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3],xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm10, %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [1,1,2,2,2,2,2,2] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm15, %ymm15 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] +; AVX512DQ-FCP-NEXT: vpandnq %ymm15, %ymm27, %ymm15 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm15 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [4,0,5,5,5,5,0,6] +; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm28, %ymm28 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] +; AVX512DQ-FCP-NEXT: vpandnq %ymm28, %ymm15, %ymm28 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm11, %zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [2,2,3,3,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm20 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm28, %zmm0 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2 -; 
AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,0,1,1,8,8,9,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm3, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm3 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm22[0,0,1,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,0,1,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm4, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm5 & (zmm4 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm4 & zmm10) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm26[2,2,3,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm28[2,2,3,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm27, %zmm4 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm5 & (zmm4 ^ zmm2)) -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,10,11,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm2 & (zmm18 ^ zmm17)) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [12,0,13,13,13,13,0,14,14,14,14,0,15,15,15,15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm18)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm2 & (zmm0 ^ zmm20)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm4 & mem) -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm4 & (zmm5 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm22[0,0,1,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm24[0,0,1,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (zmm7 & (zmm6 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm6 & zmm27) +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,2,3,3,10,10,11,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255] +; AVX512DQ-FCP-NEXT: vpandnq %zmm8, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpandq %zmm3, %zmm13, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 & (zmm8 | zmm6) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [12,14,13,13,13,13,12,14,14,14,14,14,15,15,15,15] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm10 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm6 & ~zmm10) | zmm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm17[2,2,3,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm19[2,2,3,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm18, %zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm7 & (zmm8 ^ zmm6)) +; AVX512DQ-FCP-NEXT: vpandnq %zmm20, %zmm3, %zmm6 +; AVX512DQ-FCP-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm9 & (zmm0 | zmm6) +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [6,6,6,0,7,7,7,7,8,8,8,8,8,8,9,9] +; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm2 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm2 & ~zmm3) | zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 | (zmm8 & mem) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%r9) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%r9) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -5384,8 +5456,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 
; AVX512BW-NEXT: kmovq %rax, %k4 ; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm5 {%k4} -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] -; AVX512BW-NEXT: vpermd %zmm0, %zmm14, %zmm14 +; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,24,24,24,24,24,24,25,25] +; AVX512BW-NEXT: vpermi2d %zmm0, %zmm0, %zmm14 ; AVX512BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512BW-NEXT: kmovq %rax, %k2 ; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} @@ -5702,8 +5774,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631 ; AVX512DQ-BW-NEXT: kmovq %rax, %k4 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm5 {%k4} -; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9] -; AVX512DQ-BW-NEXT: vpermd %zmm0, %zmm14, %zmm14 +; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [6,6,6,6,7,7,7,7,24,24,24,24,24,24,25,25] +; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm0, %zmm14 ; AVX512DQ-BW-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842 ; AVX512DQ-BW-NEXT: kmovq %rax, %k2 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 03f5b90002d34..a044eee673de8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -2608,386 +2608,422 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-LABEL: store_i8_stride6_vf32: ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512-NEXT: vmovdqa (%r8), %ymm0 -; AVX512-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512-NEXT: vprold $16, %ymm9, %ymm9 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; AVX512-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] 
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm6 ^ (zmm14 & (zmm13 ^ zmm6)) -; AVX512-NEXT: vmovdqa (%r9), %xmm11 -; AVX512-NEXT: vmovdqa (%r8), %xmm12 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512-NEXT: vmovdqa (%r8), %ymm8 +; AVX512-NEXT: vmovdqa (%r9), %ymm10 +; AVX512-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm6 +; AVX512-NEXT: vpandnq %zmm4, %zmm6, %zmm7 +; AVX512-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm15 = 
ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[16],ymm12[16],ymm11[17],ymm12[17],ymm11[18],ymm12[18],ymm11[19],ymm12[19],ymm11[20],ymm12[20],ymm11[21],ymm12[21],ymm11[22],ymm12[22],ymm11[23],ymm12[23] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %xmm13, %xmm9, %xmm15 -; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 -; AVX512-NEXT: vpermq {{.*#+}} zmm15 = zmm13[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm7[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 ^ (mem & (zmm8 ^ zmm15)) -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512-NEXT: vpshufb %xmm7, %xmm11, %xmm9 -; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ zmm8)) -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm10 -; AVX512-NEXT: vpshufb %ymm8, %ymm2, %ymm8 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm8 = 
ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm3 & (zmm2 ^ zmm8)) -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm2 -; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9 +; AVX512-NEXT: vpandq %zmm6, %zmm9, %zmm15 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = 
[0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm16 & (zmm15 | zmm7) +; AVX512-NEXT: vmovdqa (%r9), %xmm6 +; AVX512-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm3 & ~zmm9) | zmm15 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-NEXT: vpshufb %ymm3, %ymm12, %ymm15 +; AVX512-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[4],ymm15[4],ymm3[5],ymm15[5],ymm3[6],ymm15[6],ymm3[7],ymm15[7],ymm3[16],ymm15[16],ymm3[17],ymm15[17],ymm3[18],ymm15[18],ymm3[19],ymm15[19],ymm3[20],ymm15[20],ymm3[21],ymm15[21],ymm3[22],ymm15[22],ymm3[23],ymm15[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15],ymm13[24],ymm14[24],ymm13[25],ymm14[25],ymm13[26],ymm14[26],ymm13[27],ymm14[27],ymm13[28],ymm14[28],ymm13[29],ymm14[29],ymm13[30],ymm14[30],ymm13[31],ymm14[31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 +; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 +; AVX512-NEXT: vpandnq %zmm3, %zmm18, %zmm3 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm15 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15],ymm11[24],ymm12[24],ymm11[25],ymm12[25],ymm11[26],ymm12[26],ymm11[27],ymm12[27],ymm11[28],ymm12[28],ymm11[29],ymm12[29],ymm11[30],ymm12[30],ymm11[31],ymm12[31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpandq %zmm18, %zmm11, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm17 & (zmm12 | zmm3) +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm3 +; AVX512-NEXT: vpshufb %ymm11, %ymm8, %ymm13 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[4],ymm3[4],ymm13[5],ymm3[5],ymm13[6],ymm3[6],ymm13[7],ymm3[7],ymm13[16],ymm3[16],ymm13[17],ymm3[17],ymm13[18],ymm3[18],ymm13[19],ymm3[19],ymm13[20],ymm3[20],ymm13[21],ymm3[21],ymm13[22],ymm3[22],ymm13[23],ymm3[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15],ymm8[24],ymm10[24],ymm8[25],ymm10[25],ymm8[26],ymm10[26],ymm8[27],ymm10[27],ymm8[28],ymm10[28],ymm8[29],ymm10[29],ymm8[30],ymm10[30],ymm8[31],ymm10[31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm3 & ~zmm8) | zmm12 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-NEXT: vpshufb %xmm3, %xmm4, %xmm10 +; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vprold $16, %xmm10, %xmm10 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm2 & (zmm1 ^ zmm3)) +; AVX512-NEXT: vpshufb %xmm11, %xmm6, %xmm1 +; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; 
AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm2)) +; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride6_vf32: ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm6 ^ (zmm9 & (zmm8 ^ zmm6)) -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vpshufb %ymm10, 
%ymm0, %ymm8 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm9)) +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm6 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15],ymm9[24],ymm10[24],ymm9[25],ymm10[25],ymm9[26],ymm10[26],ymm9[27],ymm10[27],ymm9[28],ymm10[28],ymm9[29],ymm10[29],ymm9[30],ymm10[30],ymm9[31],ymm10[31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpandnq %zmm1, %zmm2, %zmm3 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm11 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[4],ymm4[4],ymm11[5],ymm4[5],ymm11[6],ymm4[6],ymm11[7],ymm4[7],ymm11[16],ymm4[16],ymm11[17],ymm4[17],ymm11[18],ymm4[18],ymm11[19],ymm4[19],ymm11[20],ymm4[20],ymm11[21],ymm4[21],ymm11[22],ymm4[22],ymm11[23],ymm4[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = 
ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vpandq %zmm2, %zmm4, %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm4 & (zmm11 | zmm3) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[16],ymm3[16],ymm12[17],ymm3[17],ymm12[18],ymm3[18],ymm12[19],ymm3[19],ymm12[20],ymm3[20],ymm12[21],ymm3[21],ymm12[22],ymm3[22],ymm12[23],ymm3[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm3[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm12 & ~zmm3) | zmm11 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[16],ymm10[16],ymm9[17],ymm10[17],ymm9[18],ymm10[18],ymm9[19],ymm10[19],ymm9[20],ymm10[20],ymm9[21],ymm10[21],ymm9[22],ymm10[22],ymm9[23],ymm10[23] +; AVX512-FCP-NEXT: vprold $16, %ymm9, %ymm11 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, 
%zmm16, %zmm11 +; AVX512-FCP-NEXT: vpandnq %zmm12, %zmm11, %zmm12 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512-FCP-NEXT: vprold $16, %xmm14, %xmm14 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm13[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm15 & (zmm10 ^ zmm14)) -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm4 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; 
AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm15 & (zmm3 ^ zmm5)) -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm0 +; AVX512-FCP-NEXT: vpandq %zmm11, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm13 & (zmm0 | zmm12) +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm6 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm13, %zmm11 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm12 & ~zmm11) | zmm0 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, 
%xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0)) +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm16 & (zmm0 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride6_vf32: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm6 -; 
AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm13 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm6 ^ (zmm14 & (zmm13 ^ zmm6)) -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm12 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm14 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512DQ-NEXT: vprold $16, %ymm3, %ymm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = 
[65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm6 +; AVX512DQ-NEXT: vpandnq %zmm4, %zmm6, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm11[0],ymm12[0],ymm11[1],ymm12[1],ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[4],ymm12[4],ymm11[5],ymm12[5],ymm11[6],ymm12[6],ymm11[7],ymm12[7],ymm11[16],ymm12[16],ymm11[17],ymm12[17],ymm11[18],ymm12[18],ymm11[19],ymm12[19],ymm11[20],ymm12[20],ymm11[21],ymm12[21],ymm11[22],ymm12[22],ymm11[23],ymm12[23] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13)) -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm9, %xmm15 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm13 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm15[8],xmm13[9],xmm15[9],xmm13[10],xmm15[10],xmm13[11],xmm15[11],xmm13[12],xmm15[12],xmm13[13],xmm15[13],xmm13[14],xmm15[14],xmm13[15],xmm15[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-NEXT: vprold $16, %xmm15, %xmm15 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm15, %zmm13 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm15 = zmm13[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm13 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm7, %xmm7 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm8 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm7[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 ^ (mem & (zmm8 ^ zmm15)) -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm9 -; AVX512DQ-NEXT: vpshufb %xmm7, %xmm12, %xmm10 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm14 & (zmm9 ^ zmm8)) -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm10 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm2, %ymm8 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm5, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm13, %ymm4, %ymm4 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm3 & (zmm2 ^ zmm8)) -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm2 -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm4 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[16],ymm2[16],ymm4[17],ymm2[17],ymm4[18],ymm2[18],ymm4[19],ymm2[19],ymm4[20],ymm2[20],ymm4[21],ymm2[21],ymm4[22],ymm2[22],ymm4[23],ymm2[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9 +; AVX512DQ-NEXT: vpandq %zmm6, %zmm9, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm16 & (zmm15 | zmm7) +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm10[0],ymm8[1],ymm10[1],ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[4],ymm10[4],ymm8[5],ymm10[5],ymm8[6],ymm10[6],ymm8[7],ymm10[7],ymm8[16],ymm10[16],ymm8[17],ymm10[17],ymm8[18],ymm10[18],ymm8[19],ymm10[19],ymm8[20],ymm10[20],ymm8[21],ymm10[21],ymm8[22],ymm10[22],ymm8[23],ymm10[23] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,1,0,3,4,5,6,7,10,9,8,11,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm3 & ~zmm9) | zmm15 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm12, %ymm15 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm11, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[4],ymm15[4],ymm3[5],ymm15[5],ymm3[6],ymm15[6],ymm3[7],ymm15[7],ymm3[16],ymm15[16],ymm3[17],ymm15[17],ymm3[18],ymm15[18],ymm3[19],ymm15[19],ymm3[20],ymm15[20],ymm3[21],ymm15[21],ymm3[22],ymm15[22],ymm3[23],ymm15[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15],ymm13[24],ymm14[24],ymm13[25],ymm14[25],ymm13[26],ymm14[26],ymm13[27],ymm14[27],ymm13[28],ymm14[28],ymm13[29],ymm14[29],ymm13[30],ymm14[30],ymm13[31],ymm14[31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18 +; AVX512DQ-NEXT: vpandnq %zmm3, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm15 
= [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm14, %ymm14 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm13 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[16],ymm14[16],ymm13[17],ymm14[17],ymm13[18],ymm14[18],ymm13[19],ymm14[19],ymm13[20],ymm14[20],ymm13[21],ymm14[21],ymm13[22],ymm14[22],ymm13[23],ymm14[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm11[8],ymm12[8],ymm11[9],ymm12[9],ymm11[10],ymm12[10],ymm11[11],ymm12[11],ymm11[12],ymm12[12],ymm11[13],ymm12[13],ymm11[14],ymm12[14],ymm11[15],ymm12[15],ymm11[24],ymm12[24],ymm11[25],ymm12[25],ymm11[26],ymm12[26],ymm11[27],ymm12[27],ymm11[28],ymm12[28],ymm11[29],ymm12[29],ymm11[30],ymm12[30],ymm11[31],ymm12[31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm11 = zmm11[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpandq %zmm18, %zmm11, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm17 & (zmm12 | zmm3) +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm11 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm3 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm8, %ymm13 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm13[0],ymm3[0],ymm13[1],ymm3[1],ymm13[2],ymm3[2],ymm13[3],ymm3[3],ymm13[4],ymm3[4],ymm13[5],ymm3[5],ymm13[6],ymm3[6],ymm13[7],ymm3[7],ymm13[16],ymm3[16],ymm13[17],ymm3[17],ymm13[18],ymm3[18],ymm13[19],ymm3[19],ymm13[20],ymm3[20],ymm13[21],ymm3[21],ymm13[22],ymm3[22],ymm13[23],ymm3[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm8[8],ymm10[8],ymm8[9],ymm10[9],ymm8[10],ymm10[10],ymm8[11],ymm10[11],ymm8[12],ymm10[12],ymm8[13],ymm10[13],ymm8[14],ymm10[14],ymm8[15],ymm10[15],ymm8[24],ymm10[24],ymm8[25],ymm10[25],ymm8[26],ymm10[26],ymm8[27],ymm10[27],ymm8[28],ymm10[28],ymm8[29],ymm10[29],ymm8[30],ymm10[30],ymm8[31],ymm10[31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm3 & ~zmm8) | zmm12 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm4, %xmm10 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm10 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm10, %zmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm3 = zmm3[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm2 & (zmm1 ^ zmm3)) +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm6, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf32: ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} 
ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm6 ^ (zmm9 & (zmm8 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm6 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15],ymm9[24],ymm10[24],ymm9[25],ymm10[25],ymm9[26],ymm10[26],ymm9[27],ymm10[27],ymm9[28],ymm10[28],ymm9[29],ymm10[29],ymm9[30],ymm10[30],ymm9[31],ymm10[31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} 
zmm16 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpandnq %zmm1, %zmm2, %zmm3 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm4 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm11 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm11[0],ymm4[0],ymm11[1],ymm4[1],ymm11[2],ymm4[2],ymm11[3],ymm4[3],ymm11[4],ymm4[4],ymm11[5],ymm4[5],ymm11[6],ymm4[6],ymm11[7],ymm4[7],ymm11[16],ymm4[16],ymm11[17],ymm4[17],ymm11[18],ymm4[18],ymm11[19],ymm4[19],ymm11[20],ymm4[20],ymm11[21],ymm4[21],ymm11[22],ymm4[22],ymm11[23],ymm4[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm4, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm4 & (zmm11 | zmm3) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[16],ymm3[16],ymm12[17],ymm3[17],ymm12[18],ymm3[18],ymm12[19],ymm3[19],ymm12[20],ymm3[20],ymm12[21],ymm3[21],ymm12[22],ymm3[22],ymm12[23],ymm3[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27,24,25,22,23,28,29,26,27,28,29,30,31] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm3, %zmm3 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm3[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = (zmm12 & ~zmm3) | zmm11 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[16],ymm10[16],ymm9[17],ymm10[17],ymm9[18],ymm10[18],ymm9[19],ymm10[19],ymm9[20],ymm10[20],ymm9[21],ymm10[21],ymm9[22],ymm10[22],ymm9[23],ymm10[23] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm9, %ymm11 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,0,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm16, %zmm11 +; AVX512DQ-FCP-NEXT: vpandnq %zmm12, %zmm11, %zmm12 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm13 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm14, %xmm14 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm13 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm13[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = 
xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm15, %zmm10 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm15 & (zmm10 ^ zmm14)) -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm4 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,1,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm3 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm15 & (zmm3 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} 
xmm0 = xmm0[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm14, %zmm0 +; AVX512DQ-FCP-NEXT: vpandq %zmm11, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm13 & (zmm0 | zmm12) +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,16,17,22,23,24,25,24,25,24,25,24,25] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,10,11,8,9,6,7,12,13,10,11,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm13, %zmm11 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm12 & ~zmm11) | zmm0 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm4 & (zmm1 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,2,3,0,1,6,7,8,9,8,9,8,9,8,9] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm16 & (zmm0 ^ zmm4)) +; 
AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -5312,1228 +5348,1294 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-LABEL: store_i8_stride6_vf64: ; AVX512: # %bb.0: ; AVX512-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa (%r8), %xmm10 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512-NEXT: vmovdqa (%r9), %xmm12 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm0 -; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX512-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512-NEXT: vpshufb %xmm9, %xmm6, %xmm4 -; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm5 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm4 +; AVX512-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm6 +; AVX512-NEXT: vmovdqa64 %xmm12, %xmm31 +; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-NEXT: vmovdqa64 %xmm8, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm21 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm20 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX512-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %ymm15, %ymm4, %ymm0 -; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm1 -; AVX512-NEXT: vpunpcklbw 
{{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0 +; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm11[8],ymm6[9],ymm11[9],ymm6[10],ymm11[10],ymm6[11],ymm11[11],ymm6[12],ymm11[12],ymm6[13],ymm11[13],ymm6[14],ymm11[14],ymm6[15],ymm11[15],ymm6[24],ymm11[24],ymm6[25],ymm11[25],ymm6[26],ymm11[26],ymm6[27],ymm11[27],ymm6[28],ymm11[28],ymm6[29],ymm11[29],ymm6[30],ymm11[30],ymm6[31],ymm11[31] +; AVX512-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512-NEXT: vpshufb %ymm12, %ymm13, %ymm0 -; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31] -; AVX512-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm2 +; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm3 +; 
AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15],ymm9[24],ymm10[24],ymm9[25],ymm10[25],ymm9[26],ymm10[26],ymm9[27],ymm10[27],ymm9[28],ymm10[28],ymm9[29],ymm10[29],ymm9[30],ymm10[30],ymm9[31],ymm10[31] +; AVX512-NEXT: vmovdqa64 %ymm9, %ymm16 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm5, %ymm13, %ymm0 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX512-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm4, %ymm13, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpshufb %ymm4, %ymm13, %ymm3 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 32(%r9), %ymm14 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm0 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 -; AVX512-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm26 -; AVX512-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512-NEXT: vpshufb %ymm12, 
%ymm10, %ymm6 -; AVX512-NEXT: vpshufb %ymm12, %ymm15, %ymm9 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[16],ymm6[16],ymm9[17],ymm6[17],ymm9[18],ymm6[18],ymm9[19],ymm6[19],ymm9[20],ymm6[20],ymm9[21],ymm6[21],ymm9[22],ymm6[22],ymm9[23],ymm6[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15],ymm15[24],ymm10[24],ymm15[25],ymm10[25],ymm15[26],ymm10[26],ymm15[27],ymm10[27],ymm15[28],ymm10[28],ymm15[29],ymm10[29],ymm15[30],ymm10[30],ymm15[31],ymm10[31] -; AVX512-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm24 -; AVX512-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23 +; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm2 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm7, %ymm14, %ymm15 +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm29 +; AVX512-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm0 +; AVX512-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15],ymm10[24],ymm15[24],ymm10[25],ymm15[25],ymm10[26],ymm15[26],ymm10[27],ymm15[27],ymm10[28],ymm15[28],ymm10[29],ymm15[29],ymm10[30],ymm15[30],ymm10[31],ymm15[31] +; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 +; AVX512-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm8 +; AVX512-NEXT: vpshufb %ymm12, %ymm6, %ymm5 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[4],ymm8[4],ymm5[5],ymm8[5],ymm5[6],ymm8[6],ymm5[7],ymm8[7],ymm5[16],ymm8[16],ymm5[17],ymm8[17],ymm5[18],ymm8[18],ymm5[19],ymm8[19],ymm5[20],ymm8[20],ymm5[21],ymm8[21],ymm5[22],ymm8[22],ymm5[23],ymm8[23] +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15],ymm6[24],ymm9[24],ymm6[25],ymm9[25],ymm6[26],ymm9[26],ymm6[27],ymm9[27],ymm6[28],ymm9[28],ymm6[29],ymm9[29],ymm6[30],ymm9[30],ymm6[31],ymm9[31] +; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm8, %ymm8 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm25 +; AVX512-NEXT: vmovdqa (%r8), %ymm5 +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm8 +; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm23 ; AVX512-NEXT: vmovdqa (%r9), %ymm4 ; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512-NEXT: 
vinserti64x4 $1, %ymm3, %zmm2, %zmm25 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23] -; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm24 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[16],ymm11[16],ymm0[17],ymm11[17],ymm0[18],ymm11[18],ymm0[19],ymm11[19],ymm0[20],ymm11[20],ymm0[21],ymm11[21],ymm0[22],ymm11[22],ymm0[23],ymm11[23] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm10 -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23] +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; 
AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[4],ymm3[4],ymm11[5],ymm3[5],ymm11[6],ymm3[6],ymm11[7],ymm3[7],ymm11[16],ymm3[16],ymm11[17],ymm3[17],ymm11[18],ymm3[18],ymm11[19],ymm3[19],ymm11[20],ymm3[20],ymm11[21],ymm3[21],ymm11[22],ymm3[22],ymm11[23],ymm3[23] -; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm7 -; AVX512-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm5 -; AVX512-NEXT: vpshufb %ymm11, %ymm13, %ymm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm13 -; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX512-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm15 +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm10 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm6, 
%ymm13, %ymm11 +; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm13 +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm14 ; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; AVX512-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm14 -; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm8 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm3 +; AVX512-NEXT: vpshufb %xmm12, %xmm8, %xmm9 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; AVX512-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512-NEXT: vpshufb %xmm12, %xmm15, %xmm1 -; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm8 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX512-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm30 -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,0,0,1] -; AVX512-NEXT: vprold $16, %ymm16, %ymm8 -; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm8 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm18[0,0,0,1] +; AVX512-NEXT: vprold $16, %ymm16, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm20[0,0,0,1] -; AVX512-NEXT: vprold $16, %ymm17, %ymm14 -; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm31[0,0,0,1] 
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm22[0,0,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512-NEXT: vmovdqa64 %ymm28, %ymm8 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-NEXT: vpandnq %zmm2, %zmm3, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512-NEXT: vpandq %zmm3, %zmm7, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm9 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm9 & (zmm7 | zmm2) +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,0,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,0,0,1] +; AVX512-NEXT: vprold $16, %ymm17, %ymm15 +; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm22[0,0,0,1] +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm2 ^ (zmm9 & (zmm8 ^ zmm2)) -; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm8)) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,2,2,3] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm7 ^ (zmm9 & (zmm1 ^ zmm7)) -; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm31, %zmm6 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm1)) -; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm1 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm2)) -; AVX512-NEXT: 
vinserti64x4 $1, %ymm3, %zmm11, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm6)) +; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 +; AVX512-NEXT: vpandnq %zmm2, %zmm9, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 & (zmm2 | zmm7) +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm7 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm12 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm7 & ~zmm12) | zmm2 +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm15[2,2,2,3] +; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm30, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm2 ^ (zmm3 & (zmm8 ^ zmm2)) +; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm31 & (zmm2 ^ zmm8)) +; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm12 & (zmm4 ^ zmm2)) ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm4 & (zmm3 ^ zmm0)) +; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm5 & (zmm2 ^ zmm0)) ; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm3)) -; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm0)) -; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm4 & (zmm6 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm5, 
%zmm3, %zmm6 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1)) +; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = mem[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm6 & (zmm1 ^ zmm2)) +; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm11 & (zmm2 ^ zmm0)) ; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm6)) -; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm0)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm8 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm0 ^ (zmm8 & (zmm1 ^ zmm0)) ; AVX512-NEXT: vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload ; AVX512-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm9 & (zmm5 ^ zmm0)) +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0)) ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm24[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (zmm9 & (zmm7 ^ zmm0)) -; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm23[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm4 & (zmm5 ^ zmm7)) -; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm7 & (zmm4 ^ zmm0)) -; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm7 & (zmm0 ^ zmm5)) +; AVX512-NEXT: vpermq {{.*#+}} zmm10 = zmm25[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm3 & (zmm10 ^ zmm0)) +; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm5, %zmm0 +; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512-NEXT: # zmm3 = mem[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 
^ zmm1)) +; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm23[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm10)) +; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm29[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm3)) +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 +; AVX512-NEXT: vpermq {{.*#+}} zmm5 = zmm24[2,2,2,3,6,6,6,7] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm3 & (zmm1 ^ zmm5)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride6_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $280, %rsp # imm = 0x118 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-FCP-NEXT: vprold $16, %ymm0, %ymm18 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,10,10,10,11] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm10 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512-FCP-NEXT: subq $232, %rsp +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,1,10,10,10,11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; 
AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm9 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm2 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm30 -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm19 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm16 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm29 -; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; 
AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm20 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm27 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm16 +; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm4 +; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm14 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm11 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm11 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm1 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23] +; AVX512-FCP-NEXT: vprold $16, %ymm0, %ymm21 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm18 & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm0 ^ (zmm21 & (zmm9 ^ zmm0)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm10 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, 
%xmm0 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm24 & (zmm0 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] -; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm25 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] -; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] -; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] -; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm20 -; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm13 -; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm9 ^ (zmm18 & (zmm13 ^ zmm9)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm9 -; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm24 & (zmm9 ^ zmm13)) -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm12 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm13 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm12 & (zmm3 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 -; AVX512-FCP-NEXT: 
vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm12 & (zmm8 ^ zmm9)) -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm9 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm12 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[16],ymm9[16],ymm12[17],ymm9[17],ymm12[18],ymm9[18],ymm12[19],ymm9[19],ymm12[20],ymm9[20],ymm12[21],ymm9[21],ymm12[22],ymm9[22],ymm12[23],ymm9[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,2,3,10,10,10,11] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm6 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[16],ymm1[16],ymm6[17],ymm1[17],ymm6[18],ymm1[18],ymm6[19],ymm1[19],ymm6[20],ymm1[20],ymm6[21],ymm1[21],ymm6[22],ymm1[22],ymm6[23],ymm1[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm18 & (zmm1 ^ zmm9)) -; AVX512-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm9 = mem[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm22 & (zmm9 ^ zmm1)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm7 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm1 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm18 & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm30[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm22 & (zmm2 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm9)) +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm28 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm28 & (zmm12 ^ zmm0)) +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm13 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm13 = 
ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,10,10,10,11] +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[16],ymm2[16],ymm15[17],ymm2[17],ymm15[18],ymm2[18],ymm15[19],ymm2[19],ymm15[20],ymm2[20],ymm15[21],ymm2[21],ymm15[22],ymm2[22],ymm15[23],ymm2[23] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm13 ^ (zmm21 & (zmm2 ^ zmm13)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm29 = zmm20[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm20, %zmm0 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm2)) +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm5 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm3 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm2 +; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 +; AVX512-FCP-NEXT: vpandnq %zmm6, %zmm21, %zmm25 +; AVX512-FCP-NEXT: vpandq %zmm21, %zmm14, %zmm26 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm27 +; AVX512-FCP-NEXT: vpandnq %zmm11, %zmm27, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm21 & (zmm3 ^ zmm5)) +; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm0 & (zmm16 ^ zmm3)) ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm5 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,8,8,8,9] -; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm6 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3],xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = 
[0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm4 ^ (zmm22 & (zmm10 ^ zmm4)) -; AVX512-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm4 = mem[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm18 & (zmm4 ^ zmm10)) -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm14 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm10 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm13 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm10 ^ (zmm22 & (zmm7 ^ zmm10)) +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,1,8,8,8,9] +; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm20 & (zmm7 ^ zmm3)) +; AVX512-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm7)) +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm9 ^ (zmm20 & (zmm7 ^ zmm9)) ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm7)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm3 = mem[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm9)) -; AVX512-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm2)) -; AVX512-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm4)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) +; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm27 & (zmm26 | zmm25) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 & (zmm2 | zmm26) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & ~zmm28) | zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm5 & (zmm16 ^ zmm4)) +; AVX512-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm2 & (zmm4 ^ zmm29)) +; AVX512-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm7 & (zmm6 ^ zmm3)) +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512-FCP-NEXT: addq $280, %rsp # imm = 0x118 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512-FCP-NEXT: addq $232, %rsp ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride6_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: subq $456, %rsp # imm = 0x1C8 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm6 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm10 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm3 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm7 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm10 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm12 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm8 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm30 -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm8 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm5 -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm27 -; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm6, %xmm4 -; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm5 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm18 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm20 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm28 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm3 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm30 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm12, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm31 +; AVX512DQ-NEXT: vinserti32x4 $2, %xmm5, %zmm6, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm21 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm22 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm20 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm21 +; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm4, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: 
vpbroadcastq {{.*#+}} ymm1 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm11[8],ymm6[9],ymm11[9],ymm6[10],ymm11[10],ymm6[11],ymm11[11],ymm6[12],ymm11[12],ymm6[13],ymm11[13],ymm6[14],ymm11[14],ymm6[15],ymm11[15],ymm6[24],ymm11[24],ymm6[25],ymm11[25],ymm6[26],ymm11[26],ymm6[27],ymm11[27],ymm6[28],ymm11[28],ymm6[29],ymm11[29],ymm6[30],ymm11[30],ymm6[31],ymm11[31] +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm18 +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm17 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm13, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm8[8],ymm13[8],ymm8[9],ymm13[9],ymm8[10],ymm13[10],ymm8[11],ymm13[11],ymm8[12],ymm13[12],ymm8[13],ymm13[13],ymm8[14],ymm13[14],ymm8[15],ymm13[15],ymm8[24],ymm13[24],ymm8[25],ymm13[25],ymm8[26],ymm13[26],ymm8[27],ymm13[27],ymm8[28],ymm13[28],ymm8[29],ymm13[29],ymm8[30],ymm13[30],ymm8[31],ymm13[31] -; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm17 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm3 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15],ymm9[24],ymm10[24],ymm9[25],ymm10[25],ymm9[26],ymm10[26],ymm9[27],ymm10[27],ymm9[28],ymm10[28],ymm9[29],ymm10[29],ymm9[30],ymm10[30],ymm9[31],ymm10[31] +; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm16 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm0, 
%ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm13, %ymm0 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] +; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm24 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm13, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm13, %ymm3 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm14 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm0 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] -; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm28 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm1, %ymm10 -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm0, %ymm15 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm15, %ymm6 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm10, %zmm26 -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm10 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm15 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm6 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm15, %ymm9 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm9[0],ymm6[0],ymm9[1],ymm6[1],ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[4],ymm6[4],ymm9[5],ymm6[5],ymm9[6],ymm6[6],ymm9[7],ymm6[7],ymm9[16],ymm6[16],ymm9[17],ymm6[17],ymm9[18],ymm6[18],ymm9[19],ymm6[19],ymm9[20],ymm6[20],ymm9[21],ymm6[21],ymm9[22],ymm6[22],ymm9[23],ymm6[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm15[8],ymm10[8],ymm15[9],ymm10[9],ymm15[10],ymm10[10],ymm15[11],ymm10[11],ymm15[12],ymm10[12],ymm15[13],ymm10[13],ymm15[14],ymm10[14],ymm15[15],ymm10[15],ymm15[24],ymm10[24],ymm15[25],ymm10[25],ymm15[26],ymm10[26],ymm15[27],ymm10[27],ymm15[28],ymm10[28],ymm15[29],ymm10[29],ymm15[30],ymm10[30],ymm15[31],ymm10[31] -; AVX512DQ-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm6, 
%zmm24 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm23 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm2 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm14, %ymm15 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm29 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm10 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11],ymm10[12],ymm15[12],ymm10[13],ymm15[13],ymm10[14],ymm15[14],ymm10[15],ymm15[15],ymm10[24],ymm15[24],ymm10[25],ymm15[25],ymm10[26],ymm15[26],ymm10[27],ymm15[27],ymm10[28],ymm15[28],ymm10[29],ymm15[29],ymm10[30],ymm15[30],ymm10[31],ymm15[31] +; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm26 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm6 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshufb %ymm12, %ymm6, %ymm5 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[2],ymm8[2],ymm5[3],ymm8[3],ymm5[4],ymm8[4],ymm5[5],ymm8[5],ymm5[6],ymm8[6],ymm5[7],ymm8[7],ymm5[16],ymm8[16],ymm5[17],ymm8[17],ymm5[18],ymm8[18],ymm5[19],ymm8[19],ymm5[20],ymm8[20],ymm5[21],ymm8[21],ymm5[22],ymm8[22],ymm5[23],ymm8[23] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm6[8],ymm9[8],ymm6[9],ymm9[9],ymm6[10],ymm9[10],ymm6[11],ymm9[11],ymm6[12],ymm9[12],ymm6[13],ymm9[13],ymm6[14],ymm9[14],ymm6[15],ymm9[15],ymm6[24],ymm9[24],ymm6[25],ymm9[25],ymm6[26],ymm9[26],ymm6[27],ymm9[27],ymm6[28],ymm9[28],ymm6[29],ymm9[29],ymm6[30],ymm9[30],ymm6[31],ymm9[31] +; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm8, %ymm8 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm25 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm5 +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm8 +; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm23 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm4 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm4, %ymm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm25 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm10[0],ymm15[1],ymm10[1],ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[4],ymm10[4],ymm15[5],ymm10[5],ymm15[6],ymm10[6],ymm15[7],ymm10[7],ymm15[16],ymm10[16],ymm15[17],ymm10[17],ymm15[18],ymm10[18],ymm15[19],ymm10[19],ymm15[20],ymm10[20],ymm15[21],ymm10[21],ymm15[22],ymm10[22],ymm15[23],ymm10[23] -; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm7 +; 
AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm11[0],ymm0[1],ymm11[1],ymm0[2],ymm11[2],ymm0[3],ymm11[3],ymm0[4],ymm11[4],ymm0[5],ymm11[5],ymm0[6],ymm11[6],ymm0[7],ymm11[7],ymm0[16],ymm11[16],ymm0[17],ymm11[17],ymm0[18],ymm11[18],ymm0[19],ymm11[19],ymm0[20],ymm11[20],ymm0[21],ymm11[21],ymm0[22],ymm11[22],ymm0[23],ymm11[23] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm10 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20 -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[16],ymm3[16],ymm8[17],ymm3[17],ymm8[18],ymm3[18],ymm8[19],ymm3[19],ymm8[20],ymm3[20],ymm8[21],ymm3[21],ymm8[22],ymm3[22],ymm8[23],ymm3[23] +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm27 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[2],ymm9[2],ymm6[3],ymm9[3],ymm6[4],ymm9[4],ymm6[5],ymm9[5],ymm6[6],ymm9[6],ymm6[7],ymm9[7],ymm6[16],ymm9[16],ymm6[17],ymm9[17],ymm6[18],ymm9[18],ymm6[19],ymm9[19],ymm6[20],ymm9[20],ymm6[21],ymm9[21],ymm6[22],ymm9[22],ymm6[23],ymm9[23] ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm17 -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3 -; 
AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm7 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm31 -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm11[0],ymm3[0],ymm11[1],ymm3[1],ymm11[2],ymm3[2],ymm11[3],ymm3[3],ymm11[4],ymm3[4],ymm11[5],ymm3[5],ymm11[6],ymm3[6],ymm11[7],ymm3[7],ymm11[16],ymm3[16],ymm11[17],ymm3[17],ymm11[18],ymm3[18],ymm11[19],ymm3[19],ymm11[20],ymm3[20],ymm11[21],ymm3[21],ymm11[22],ymm3[22],ymm11[23],ymm3[23] -; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm7 -; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm5 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm13, %ymm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm13 -; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm11 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[4],ymm15[4],ymm10[5],ymm15[5],ymm10[6],ymm15[6],ymm10[7],ymm15[7],ymm10[16],ymm15[16],ymm10[17],ymm15[17],ymm10[18],ymm15[18],ymm10[19],ymm15[19],ymm10[20],ymm15[20],ymm10[21],ymm15[21],ymm10[22],ymm15[22],ymm10[23],ymm15[23] +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm21 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm15 +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm10 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm13, %ymm11 +; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm13 +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm14 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm4 -; 
AVX512DQ-NEXT: vpshufb %ymm3, %ymm14, %ymm3 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm14 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm8 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm8, %xmm9 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] ; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm0 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm15, %xmm1 -; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm8 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm30 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,0,0,1] -; AVX512DQ-NEXT: vprold $16, %ymm16, %ymm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm3 +; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm8 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; AVX512DQ-NEXT: vprold $16, %xmm1, %xmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm18[0,0,0,1] +; AVX512DQ-NEXT: vprold $16, %ymm16, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm20[0,0,0,1] -; AVX512DQ-NEXT: vprold $16, %ymm17, %ymm14 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm31[0,0,0,1] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm22[0,0,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] +; AVX512DQ-NEXT: vmovdqa64 
%ymm28, %ymm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpandq %zmm3, %zmm7, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm9 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm9 & (zmm7 | zmm2) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm15[0,0,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm13[0,0,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm14[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm27[0,0,0,1] +; AVX512DQ-NEXT: vprold $16, %ymm17, %ymm15 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm22[0,0,0,1] +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm8 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm2 ^ (zmm9 & (zmm8 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm8)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,2,2,3] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm7 ^ (zmm9 & (zmm1 ^ zmm7)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm31, %zmm6 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm1)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm2)) -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm6)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, 
%ymm11, %zmm2, %zmm2 +; AVX512DQ-NEXT: vpandnq %zmm2, %zmm9, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 & (zmm2 | zmm7) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm12 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm7 & ~zmm12) | zmm2 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm15[2,2,2,3] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm14, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm30, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm2 ^ (zmm3 & (zmm8 ^ zmm2)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm31 & (zmm2 ^ zmm8)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm12 & (zmm4 ^ zmm2)) ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm4 & (zmm3 ^ zmm0)) +; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm5 & (zmm2 ^ zmm0)) ; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm3)) -; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm0)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm30[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm4 & (zmm6 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm6 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm2)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] +; 
AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1)) +; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = mem[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm6 & (zmm1 ^ zmm2)) +; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm11 & (zmm2 ^ zmm0)) ; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm9 & (zmm0 ^ zmm6)) -; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm0)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm8 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm0 ^ (zmm8 & (zmm1 ^ zmm0)) ; AVX512DQ-NEXT: vpermq $234, (%rsp), %zmm0 # 64-byte Folded Reload ; AVX512DQ-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm5 = mem[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm9 & (zmm5 ^ zmm0)) +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = mem[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0)) ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm26[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm24[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm0 ^ (zmm9 & (zmm7 ^ zmm0)) -; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm0 = mem[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm4 & (zmm0 ^ zmm5)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm23[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm4 & (zmm5 ^ zmm7)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm28[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm7 & (zmm4 ^ zmm0)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm25[2,2,2,3,6,6,6,7] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm7 & (zmm0 ^ zmm5)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm10 = zmm25[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm3 & (zmm10 ^ zmm0)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm5, %zmm0 +; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm3 = mem[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm1)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm23[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ zmm10)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = 
zmm29[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm3 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm5 = zmm24[2,2,2,3,6,6,6,7] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm3 & (zmm1 ^ zmm5)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, 256(%rax) ; AVX512DQ-NEXT: addq $456, %rsp # imm = 0x1C8 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride6_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $280, %rsp # imm = 0x118 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm0, %ymm18 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,0,0,1,10,10,10,11] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: subq $232, %rsp +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,0,1,10,10,10,11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] +; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, 
%ymm4, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm21 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0] ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm2 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm22 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm30 -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm19 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm16 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm17 -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm28 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27,16,17,22,23,20,21,18,19,24,25,26,27,28,29,26,27] -; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = 
[8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm6 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm20 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15,0,10,0,13,0,12,0,11,0,14,0,0,0,0,0,15] +; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm7 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0,0,6,0,5,0,8,0,7,0,0,0,9,0,0,0,0] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm27 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm16 +; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm28 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,12,13,10,11,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm14 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm14 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm11 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm1 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 
(%rcx), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[16],ymm8[16],ymm7[17],ymm8[17],ymm7[18],ymm8[18],ymm7[19],ymm8[19],ymm7[20],ymm8[20],ymm7[21],ymm8[21],ymm7[22],ymm8[22],ymm7[23],ymm8[23] +; AVX512DQ-FCP-NEXT: vprold $16, %ymm0, %ymm21 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm18 & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm0 ^ (zmm21 & (zmm9 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm10, 
%ymm0, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm0 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm24 & (zmm0 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[2],ymm7[2],ymm5[3],ymm7[3],ymm5[4],ymm7[4],ymm5[5],ymm7[5],ymm5[6],ymm7[6],ymm5[7],ymm7[7],ymm5[16],ymm7[16],ymm5[17],ymm7[17],ymm5[18],ymm7[18],ymm5[19],ymm7[19],ymm5[20],ymm7[20],ymm5[21],ymm7[21],ymm5[22],ymm7[22],ymm5[23],ymm7[23] -; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] -; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm15 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm6[8],xmm15[9],xmm6[9],xmm15[10],xmm6[10],xmm15[11],xmm6[11],xmm15[12],xmm6[12],xmm15[13],xmm6[13],xmm15[14],xmm6[14],xmm15[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm20 -; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm13 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm23, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm9 ^ (zmm18 & (zmm13 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm9 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm24 & (zmm9 ^ zmm13)) -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm3 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm23, %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm12 & (zmm3 ^ 
zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm12 & (zmm8 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm12 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[4],ymm9[4],ymm12[5],ymm9[5],ymm12[6],ymm9[6],ymm12[7],ymm9[7],ymm12[16],ymm9[16],ymm12[17],ymm9[17],ymm12[18],ymm9[18],ymm12[19],ymm9[19],ymm12[20],ymm9[20],ymm12[21],ymm9[21],ymm12[22],ymm9[22],ymm12[23],ymm9[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,2,3,10,10,10,11] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[1],ymm1[1],ymm6[2],ymm1[2],ymm6[3],ymm1[3],ymm6[4],ymm1[4],ymm6[5],ymm1[5],ymm6[6],ymm1[6],ymm6[7],ymm1[7],ymm6[16],ymm1[16],ymm6[17],ymm1[17],ymm6[18],ymm1[18],ymm6[19],ymm1[19],ymm6[20],ymm1[20],ymm6[21],ymm1[21],ymm6[22],ymm1[22],ymm6[23],ymm1[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm18 & (zmm1 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm9 = mem[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm22 & (zmm9 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm7 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm7[8],ymm5[8],ymm7[9],ymm5[9],ymm7[10],ymm5[10],ymm7[11],ymm5[11],ymm7[12],ymm5[12],ymm7[13],ymm5[13],ymm7[14],ymm5[14],ymm7[15],ymm5[15],ymm7[24],ymm5[24],ymm7[25],ymm5[25],ymm7[26],ymm5[26],ymm7[27],ymm5[27],ymm7[28],ymm5[28],ymm7[29],ymm5[29],ymm7[30],ymm5[30],ymm7[31],ymm5[31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm2 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm5[8],ymm7[8],ymm5[9],ymm7[9],ymm5[10],ymm7[10],ymm5[11],ymm7[11],ymm5[12],ymm7[12],ymm5[13],ymm7[13],ymm5[14],ymm7[14],ymm5[15],ymm7[15],ymm5[24],ymm7[24],ymm5[25],ymm7[25],ymm5[26],ymm7[26],ymm5[27],ymm7[27],ymm5[28],ymm7[28],ymm5[29],ymm7[29],ymm5[30],ymm7[30],ymm5[31],ymm7[31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm18 & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm30[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm22 & (zmm2 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm9)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm17 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm28 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm28 & (zmm12 ^ zmm0)) +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm2 +; 
AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm13 = ymm13[0],ymm0[0],ymm13[1],ymm0[1],ymm13[2],ymm0[2],ymm13[3],ymm0[3],ymm13[4],ymm0[4],ymm13[5],ymm0[5],ymm13[6],ymm0[6],ymm13[7],ymm0[7],ymm13[16],ymm0[16],ymm13[17],ymm0[17],ymm13[18],ymm0[18],ymm13[19],ymm0[19],ymm13[20],ymm0[20],ymm13[21],ymm0[21],ymm13[22],ymm0[22],ymm13[23],ymm0[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,10,10,10,11] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm15[0],ymm2[0],ymm15[1],ymm2[1],ymm15[2],ymm2[2],ymm15[3],ymm2[3],ymm15[4],ymm2[4],ymm15[5],ymm2[5],ymm15[6],ymm2[6],ymm15[7],ymm2[7],ymm15[16],ymm2[16],ymm15[17],ymm2[17],ymm15[18],ymm2[18],ymm15[19],ymm2[19],ymm15[20],ymm2[20],ymm15[21],ymm2[21],ymm15[22],ymm2[22],ymm15[23],ymm2[23] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31,26,27,24,25,22,23,28,29,30,31,30,31,30,31,30,31] +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm13 ^ (zmm21 & (zmm2 ^ zmm13)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm29 = zmm20[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm20, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = 
ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[16],ymm2[16],ymm5[17],ymm2[17],ymm5[18],ymm2[18],ymm5[19],ymm2[19],ymm5[20],ymm2[20],ymm5[21],ymm2[21],ymm5[22],ymm2[22],ymm5[23],ymm2[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm5 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[16],ymm2[16],ymm3[17],ymm2[17],ymm3[18],ymm2[18],ymm3[19],ymm2[19],ymm3[20],ymm2[20],ymm3[21],ymm2[21],ymm3[22],ymm2[22],ymm3[23],ymm2[23] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15],ymm7[24],ymm8[24],ymm7[25],ymm8[25],ymm7[26],ymm8[26],ymm7[27],ymm8[27],ymm7[28],ymm8[28],ymm7[29],ymm8[29],ymm7[30],ymm8[30],ymm7[31],ymm8[31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm6 +; AVX512DQ-FCP-NEXT: vpandnq %zmm6, %zmm21, %zmm25 +; AVX512DQ-FCP-NEXT: vpandq %zmm21, %zmm14, %zmm26 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm27, %zmm27 +; AVX512DQ-FCP-NEXT: vpandnq %zmm11, %zmm27, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm3 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm21 & (zmm3 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm0 & (zmm16 ^ zmm3)) ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,0,1,8,8,8,9] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] -; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm6, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm11 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm6 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3],xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm4 ^ (zmm22 & (zmm10 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm18 & (zmm4 ^ zmm10)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm10 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm10 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm10 ^ (zmm22 & (zmm7 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,0,1,8,8,8,9] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] +; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm5, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,4,5,2,3,8,9,10,11,12,13,10,11] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm20 & (zmm7 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,0,0,1,4,4,4,5] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm10 +; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm6 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm9 ^ (zmm20 & (zmm7 ^ zmm9)) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm5 & (zmm0 ^ zmm7)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) -; AVX512DQ-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm3 = mem[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm5 & (zmm3 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm6 = mem[2,2,2,3,6,6,6,7] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm5 & (zmm6 ^ zmm2)) -; 
AVX512DQ-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm5 & (zmm2 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%rax) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm27 & (zmm26 | zmm25) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 & (zmm2 | zmm26) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & ~zmm28) | zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0] +; AVX512DQ-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm5 & (zmm16 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = mem[2,2,2,3,6,6,6,7] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm2 & (zmm4 ^ zmm29)) +; AVX512DQ-FCP-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,0,0,1,4,4,4,5] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm7 & (zmm6 ^ zmm3)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm7, %zmm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,0,1,4,4,4,5] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax) -; AVX512DQ-FCP-NEXT: addq $280, %rsp # imm = 0x118 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-FCP-NEXT: addq $232, %rsp ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i8_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-NEXT: 
vmovdqa64 (%r9), %zmm0 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm15 ; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm16 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512BW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512BW-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm11, %zmm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm8, %xmm7 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm10 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-NEXT: vprold $16, %xmm10, %xmm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] ; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = 
[0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] -; AVX512BW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512BW-NEXT: vpermt2w %ymm8, %ymm13, %ymm12 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512BW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm20, %zmm14 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%r8), %xmm17 +; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512BW-NEXT: vpermt2w %ymm10, %ymm12, %ymm18 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX512BW-NEXT: vpermt2w %zmm10, %zmm19, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm18, %zmm3 +; AVX512BW-NEXT: vmovdqa64 (%r9), %xmm18 +; AVX512BW-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm10[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm21[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-NEXT: vpermt2w %zmm21, %zmm22, %zmm20 ; AVX512BW-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 ; AVX512BW-NEXT: kmovq %r10, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm3 {%k2} -; AVX512BW-NEXT: vpshufb %xmm5, %xmm15, %xmm14 -; AVX512BW-NEXT: vpshufb %xmm5, %xmm18, %xmm5 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm14 -; AVX512BW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 -; AVX512BW-NEXT: vpshufb %xmm0, %xmm17, %xmm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512BW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512BW-NEXT: 
vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vmovdqu8 %zmm20, %zmm3 {%k2} +; AVX512BW-NEXT: vpshufb %xmm5, %xmm15, %xmm20 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm16, %xmm5 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm20[8],xmm5[9],xmm20[9],xmm5[10],xmm20[10],xmm5[11],xmm20[11],xmm5[12],xmm20[12],xmm5[13],xmm20[13],xmm5[14],xmm20[14],xmm5[15],xmm20[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm20 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm14, %xmm11 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-NEXT: vprold $16, %xmm11, %xmm11 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512BW-NEXT: vpermi2w %ymm7, %ymm14, %ymm13 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm7 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm13 -; AVX512BW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm20 {%k1} +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm17[0],zero,xmm17[1],zero,xmm17[2],zero,xmm17[3],zero,xmm17[4],zero,xmm17[5],zero,xmm17[6],zero,xmm17[7],zero +; AVX512BW-NEXT: vpermi2w %ymm5, %ymm20, %ymm12 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm19, %zmm20 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm5 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm18[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; 
AVX512BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm11 +; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512BW-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-NEXT: vpermt2w %zmm19, %zmm18, %zmm15 +; AVX512BW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm22, %zmm16 ; AVX512BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} -; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm19 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm16, %ymm19 +; AVX512BW-NEXT: vmovdqa (%r8), %ymm15 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm20, %ymm5, %ymm22 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512BW-NEXT: vpshufb %ymm20, %ymm15, %ymm23 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-NEXT: kmovd %r10d, %k3 -; AVX512BW-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} 
xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vmovdqu16 %ymm23, %ymm19 {%k3} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] -; AVX512BW-NEXT: vpermt2w %ymm22, %ymm23, %ymm15 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512BW-NEXT: vpermt2w %ymm17, %ymm23, %ymm16 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm16, %zmm16 +; AVX512BW-NEXT: vmovdqa64 (%r9), %ymm17 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm19, %ymm8, %ymm22 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpshufb %ymm19, %ymm17, %ymm24 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm18 ; AVX512BW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512BW-NEXT: kmovq %r10, %k4 -; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm15 {%k4} -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm16 {%k4} +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm18 ; AVX512BW-NEXT: vmovdqa64 32(%rdx), %ymm24 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm10 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm18[0],ymm24[1],ymm18[1],ymm24[2],ymm18[2],ymm24[3],ymm18[3],ymm24[4],ymm18[4],ymm24[5],ymm18[5],ymm24[6],ymm18[6],ymm24[7],ymm18[7],ymm24[16],ymm18[16],ymm24[17],ymm18[17],ymm24[18],ymm18[18],ymm24[19],ymm18[19],ymm24[20],ymm18[20],ymm24[21],ymm18[21],ymm24[22],ymm18[22],ymm24[23],ymm18[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm8 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm21 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = 
ymm21[0],ymm9[0],ymm21[1],ymm9[1],ymm21[2],ymm9[2],ymm21[3],ymm9[3],ymm21[4],ymm9[4],ymm21[5],ymm9[5],ymm21[6],ymm9[6],ymm21[7],ymm9[7],ymm21[16],ymm9[16],ymm21[17],ymm9[17],ymm21[18],ymm9[18],ymm21[19],ymm9[19],ymm21[20],ymm9[20],ymm21[21],ymm9[21],ymm21[22],ymm9[22],ymm21[23],ymm9[23] ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm18, %zmm4 -; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm4 {%k2} +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm22, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm4 {%k2} ; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 -; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512BW-NEXT: vpshufb %ymm20, %ymm10, %ymm10 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %ymm9, %ymm23, %ymm4 +; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512BW-NEXT: vpshufb %ymm20, %ymm8, %ymm8 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm6 {%k3} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %ymm7, %ymm23, %ymm4 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 ; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm6 ; AVX512BW-NEXT: vpshufb %ymm19, %ymm6, %ymm6 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm9 -; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k4} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm7 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k4} ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %ymm6, %ymm11, %ymm9 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm21, %ymm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512BW-NEXT: vpshufb %ymm0, %ymm22, %ymm10 -; AVX512BW-NEXT: vpshufb %ymm0, %ymm24, %ymm12 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = 
ymm10[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm9, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm21, %ymm8 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[16],ymm7[16],ymm8[17],ymm7[17],ymm8[18],ymm7[18],ymm8[19],ymm7[19],ymm8[20],ymm7[20],ymm8[21],ymm7[21],ymm8[22],ymm7[22],ymm8[23],ymm7[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm9[8],ymm21[9],ymm9[9],ymm21[10],ymm9[10],ymm21[11],ymm9[11],ymm21[12],ymm9[12],ymm21[13],ymm9[13],ymm21[14],ymm9[14],ymm21[15],ymm9[15],ymm21[24],ymm9[24],ymm21[25],ymm9[25],ymm21[26],ymm9[26],ymm21[27],ymm9[27],ymm21[28],ymm9[28],ymm21[29],ymm9[29],ymm21[30],ymm9[30],ymm21[31],ymm9[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm8, %ymm9, %ymm8 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm18, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm24, %ymm10 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[16],ymm8[16],ymm10[17],ymm8[17],ymm10[18],ymm8[18],ymm10[19],ymm8[19],ymm10[20],ymm8[20],ymm10[21],ymm8[21],ymm10[22],ymm8[22],ymm10[23],ymm8[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm24[8],ymm18[8],ymm24[9],ymm18[9],ymm24[10],ymm18[10],ymm24[11],ymm18[11],ymm24[12],ymm18[12],ymm24[13],ymm18[13],ymm24[14],ymm18[14],ymm24[15],ymm18[15],ymm24[24],ymm18[24],ymm24[25],ymm18[25],ymm24[26],ymm18[26],ymm24[27],ymm18[27],ymm24[28],ymm18[28],ymm24[29],ymm18[29],ymm24[30],ymm18[30],ymm24[31],ymm18[31] ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm12, %ymm18, %ymm12 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vpermw %ymm10, %ymm18, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512BW-NEXT: vpshufb %zmm10, %zmm7, %zmm7 +; AVX512BW-NEXT: 
vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] ; AVX512BW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512BW-NEXT: kmovq %rcx, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} -; AVX512BW-NEXT: vpshufb %ymm6, %ymm16, %ymm1 -; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512BW-NEXT: vpermw %ymm2, %ymm11, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufb %ymm0, %ymm13, %ymm2 -; AVX512BW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] -; AVX512BW-NEXT: vpermw %ymm2, %ymm18, %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} -; AVX512BW-NEXT: vpshufb %zmm9, %zmm5, %zmm1 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm8 {%k3} +; AVX512BW-NEXT: vpshufb %ymm6, %ymm13, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm11, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31] +; AVX512BW-NEXT: vpermw %ymm7, %ymm18, %ymm7 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb %zmm10, %zmm1, %zmm1 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm8, 320(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -6541,189 +6643,193 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512BW-FCP-NEXT: vpshufb %xmm2, 
%xmm5, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm9 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 ; AVX512BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] -; AVX512BW-FCP-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512BW-FCP-NEXT: vpermt2w %ymm3, %ymm14, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = 
[16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm11, %xmm3 -; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm16 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512BW-FCP-NEXT: vpermt2w %ymm3, %ymm4, %ymm18 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm21 = [8,9,0,0,0,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm9, %xmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm20, %zmm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm18 ; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm12, %xmm4 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm12, %xmm23 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm24 ; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 ; AVX512BW-FCP-NEXT: kmovq %r10, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm16, %xmm22 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm22 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm17, %xmm4 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm5 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX512BW-FCP-NEXT: vpermi2w %ymm7, %ymm5, %ymm14 -; AVX512BW-FCP-NEXT: 
vpshufb %xmm20, %xmm4, %xmm7 -; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm19, %zmm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm23, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm23 +; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm24 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm10, %zmm24 +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm11 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm19, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm16[0],zero,xmm16[1],zero,xmm16[2],zero,xmm16[3],zero,xmm16[4],zero,xmm16[5],zero,xmm16[6],zero,xmm16[7],zero +; AVX512BW-FCP-NEXT: vpermi2w %ymm10, %ymm11, %ymm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm16, %xmm10 +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm11 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm18[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm25, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm4 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = 
xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm22, %zmm17 ; AVX512BW-FCP-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm17, %ymm19 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm15 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm22 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm23 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] ; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm23, %ymm19 {%k3} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] -; AVX512BW-FCP-NEXT: vpermt2w %ymm22, %ymm23, %ymm17 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 +; AVX512BW-FCP-NEXT: vpermt2w %ymm16, %ymm23, %ymm17 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm17 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; 
AVX512BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm24 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-FCP-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm25, %zmm18 ; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512BW-FCP-NEXT: kmovq %r10, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm17 {%k4} -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm16 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm18 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm24 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm18[0],ymm24[1],ymm18[1],ymm24[2],ymm18[2],ymm24[3],ymm18[3],ymm24[4],ymm18[4],ymm24[5],ymm18[5],ymm24[6],ymm18[6],ymm24[7],ymm18[7],ymm24[16],ymm18[16],ymm24[17],ymm18[17],ymm24[18],ymm18[18],ymm24[19],ymm18[19],ymm24[20],ymm18[20],ymm24[21],ymm18[21],ymm24[22],ymm18[22],ymm24[23],ymm18[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm21 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm18, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm9 {%k2} -; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %ymm10, %ymm23, %ymm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} 
xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm25, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm6 {%k4} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm21, %ymm11 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm8, %ymm11, %ymm8 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm22, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm24, %ymm12 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm6[0],ymm21[1],ymm6[1],ymm21[2],ymm6[2],ymm21[3],ymm6[3],ymm21[4],ymm6[4],ymm21[5],ymm6[5],ymm21[6],ymm6[6],ymm21[7],ymm6[7],ymm21[16],ymm6[16],ymm21[17],ymm6[17],ymm21[18],ymm6[18],ymm21[19],ymm6[19],ymm21[20],ymm6[20],ymm21[21],ymm6[21],ymm21[22],ymm6[22],ymm21[23],ymm6[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm22, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm7 {%k2} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm7, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm8, %ymm8 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm5 {%k3} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpermt2w %ymm8, %ymm23, %ymm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm7 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; 
AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm25, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm5 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm21[8],ymm6[8],ymm21[9],ymm6[9],ymm21[10],ymm6[10],ymm21[11],ymm6[11],ymm21[12],ymm6[12],ymm21[13],ymm6[13],ymm21[14],ymm6[14],ymm21[15],ymm6[15],ymm21[24],ymm6[24],ymm21[25],ymm6[25],ymm21[26],ymm6[26],ymm21[27],ymm6[27],ymm21[28],ymm6[28],ymm21[29],ymm6[29],ymm21[30],ymm6[30],ymm21[31],ymm6[31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FCP-NEXT: vpermw %ymm6, %ymm9, %ymm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm18, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm24, %ymm12 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[4],ymm8[4],ymm12[5],ymm8[5],ymm12[6],ymm8[6],ymm12[7],ymm8[7],ymm12[16],ymm8[16],ymm12[17],ymm8[17],ymm12[18],ymm8[18],ymm12[19],ymm8[19],ymm12[20],ymm8[20],ymm12[21],ymm8[21],ymm12[22],ymm8[22],ymm12[23],ymm8[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm18[8],ymm24[9],ymm18[9],ymm24[10],ymm18[10],ymm24[11],ymm18[11],ymm24[12],ymm18[12],ymm24[13],ymm18[13],ymm24[14],ymm18[14],ymm24[15],ymm18[15],ymm24[24],ymm18[24],ymm24[25],ymm18[25],ymm24[26],ymm18[26],ymm24[27],ymm18[27],ymm24[28],ymm18[28],ymm24[29],ymm18[29],ymm24[30],ymm18[30],ymm24[31],ymm18[31] ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512BW-FCP-NEXT: vpermw %ymm12, %ymm18, %ymm12 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k2} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512BW-FCP-NEXT: vpshufb %zmm12, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, 
%zmm8 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] ; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31] -; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm11, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] -; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm18, %ymm2 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm8 {%k3} +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] +; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm18, %ymm7 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb %zmm12, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -6731,194 +6837,198 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm15 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm16 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm6 ; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512DQ-BW-NEXT: vpunpckhbw 
{{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm16 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm17 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm11 -; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm9 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm9, %xmm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm11, %zmm3 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm8 +; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm8, %xmm7 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm9 +; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3],xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm10, %xmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,0,1,4,4,4,5] ; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] -; AVX512DQ-BW-NEXT: vmovdqa %ymm3, %ymm12 -; AVX512DQ-BW-NEXT: vpermt2w %ymm8, %ymm13, %ymm12 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm14 = 
xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm7 +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vmovdqa64 %ymm3, %ymm18 +; AVX512DQ-BW-NEXT: vpermt2w %ymm10, %ymm12, %ymm18 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero,xmm10[4],zero,xmm10[5],zero,xmm10[6],zero,xmm10[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm19, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm18, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %xmm18 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %xmm10 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm21 = xmm10[2,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm21[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-NEXT: vpermt2w %zmm21, %zmm22, %zmm20 ; AVX512DQ-BW-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 ; AVX512DQ-BW-NEXT: kmovq %r10, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm3 {%k2} -; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm18, %xmm5 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm14 -; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 -; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm17, %xmm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm7, %xmm7 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm20, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm15, %xmm20 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm16, %xmm5 +; 
AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm20[8],xmm5[9],xmm20[9],xmm5[10],xmm20[10],xmm5[11],xmm20[11],xmm5[12],xmm20[12],xmm5[13],xmm20[13],xmm5[14],xmm20[14],xmm5[15],xmm20[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm20 +; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm13, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm14, %xmm11 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm11, %xmm11 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX512DQ-BW-NEXT: vpermi2w %ymm7, %ymm14, %ymm13 -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm7 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm20 {%k1} +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm17[0],zero,xmm17[1],zero,xmm17[2],zero,xmm17[3],zero,xmm17[4],zero,xmm17[5],zero,xmm17[6],zero,xmm17[7],zero +; AVX512DQ-BW-NEXT: vpermi2w %ymm5, %ymm20, %ymm12 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm19, %zmm20 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm12, %zmm5 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm18[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,1,2,3] +; AVX512DQ-BW-NEXT: 
vpunpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[16],ymm11[16],ymm12[17],ymm11[17],ymm12[18],ymm11[18],ymm12[19],ymm11[19],ymm12[20],ymm11[20],ymm12[21],ymm11[21],ymm12[22],ymm11[22],ymm12[23],ymm11[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm16 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm17 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm18, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm22, %zmm16 ; AVX512DQ-BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm16, %ymm19 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm15 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512DQ-BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm5, %ymm22 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm15, %ymm23 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] ; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} -; 
AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm23, %ymm19 {%k3} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] -; AVX512DQ-BW-NEXT: vpermt2w %ymm22, %ymm23, %ymm15 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vpermt2w %ymm17, %ymm23, %ymm16 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm16, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %ymm17 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512DQ-BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm8, %ymm22 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm17, %ymm24 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512DQ-BW-NEXT: vpermt2w %zmm24, %zmm25, %zmm18 ; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512DQ-BW-NEXT: kmovq %r10, %k4 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm15 {%k4} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm18, %zmm16 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm18 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %ymm24 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm18[0],ymm24[1],ymm18[1],ymm24[2],ymm18[2],ymm24[3],ymm18[3],ymm24[4],ymm18[4],ymm24[5],ymm18[5],ymm24[6],ymm18[6],ymm24[7],ymm18[7],ymm24[16],ymm18[16],ymm24[17],ymm18[17],ymm24[18],ymm18[18],ymm24[19],ymm18[19],ymm24[20],ymm18[20],ymm24[21],ymm18[21],ymm24[22],ymm18[22],ymm24[23],ymm18[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm21 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = 
ymm21[0],ymm9[0],ymm21[1],ymm9[1],ymm21[2],ymm9[2],ymm21[3],ymm9[3],ymm21[4],ymm9[4],ymm21[5],ymm9[5],ymm21[6],ymm9[6],ymm21[7],ymm9[7],ymm21[16],ymm9[16],ymm21[17],ymm9[17],ymm21[18],ymm9[18],ymm21[19],ymm9[19],ymm21[20],ymm9[20],ymm21[21],ymm9[21],ymm21[22],ymm9[22],ymm21[23],ymm9[23] ; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm18, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm22, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm4 {%k2} ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm10, %ymm10 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %ymm9, %ymm23, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm8, %ymm8 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm8, %ymm6 {%k3} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %ymm7, %ymm23, %ymm4 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm6 ; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k4} ; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm11, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm21, %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm22, %ymm10 -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm24, %ymm12 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = 
ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm9, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm21, %ymm8 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[4],ymm7[4],ymm8[5],ymm7[5],ymm8[6],ymm7[6],ymm8[7],ymm7[7],ymm8[16],ymm7[16],ymm8[17],ymm7[17],ymm8[18],ymm7[18],ymm8[19],ymm7[19],ymm8[20],ymm7[20],ymm8[21],ymm7[21],ymm8[22],ymm7[22],ymm8[23],ymm7[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm9[8],ymm21[9],ymm9[9],ymm21[10],ymm9[10],ymm21[11],ymm9[11],ymm21[12],ymm9[12],ymm21[13],ymm9[13],ymm21[14],ymm9[14],ymm21[15],ymm9[15],ymm21[24],ymm9[24],ymm21[25],ymm9[25],ymm21[26],ymm9[26],ymm21[27],ymm9[27],ymm21[28],ymm9[28],ymm21[29],ymm9[29],ymm21[30],ymm9[30],ymm21[31],ymm9[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm8, %ymm9, %ymm8 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm18, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm24, %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[4],ymm8[4],ymm10[5],ymm8[5],ymm10[6],ymm8[6],ymm10[7],ymm8[7],ymm10[16],ymm8[16],ymm10[17],ymm8[17],ymm10[18],ymm8[18],ymm10[19],ymm8[19],ymm10[20],ymm8[20],ymm10[21],ymm8[21],ymm10[22],ymm8[22],ymm10[23],ymm8[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm24[8],ymm18[8],ymm24[9],ymm18[9],ymm24[10],ymm18[10],ymm24[11],ymm18[11],ymm24[12],ymm18[12],ymm24[13],ymm18[13],ymm24[14],ymm18[14],ymm24[15],ymm18[15],ymm24[24],ymm18[24],ymm24[25],ymm18[25],ymm24[26],ymm18[26],ymm24[27],ymm18[27],ymm24[28],ymm18[28],ymm24[29],ymm18[29],ymm24[30],ymm18[30],ymm24[31],ymm18[31] ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm18, %ymm12 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512DQ-BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = 
zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm18, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,2,2,3,6,6,6,7] ; AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm16, %ymm1 -; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] -; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm11, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm13, %ymm2 -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] -; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm18, %ymm2 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vpshufb %zmm9, %zmm5, %zmm1 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm8 {%k3} +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm13, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm14, %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm11, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm12, %ymm2 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm12[8],ymm11[8],ymm12[9],ymm11[9],ymm12[10],ymm11[10],ymm12[11],ymm11[11],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15],ymm12[24],ymm11[24],ymm12[25],ymm11[25],ymm12[26],ymm11[26],ymm12[27],ymm11[27],ymm12[28],ymm11[28],ymm12[29],ymm11[29],ymm12[30],ymm11[30],ymm12[31],ymm11[31] +; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm18, %ymm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 320(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -6926,189 +7036,193 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vpshufb 
%xmm0, %xmm6, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 ; 
AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm3, %ymm14, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm11, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm12, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm3, %ymm4, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm21 = [8,9,0,0,0,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm9, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm20, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm18, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm12, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm12, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm24 ; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm16, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm17, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = 
xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm7, %ymm5, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm4, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm19, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm23, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm10, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm17, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3],xmm17[4],xmm15[4],xmm17[5],xmm15[5],xmm17[6],xmm15[6],xmm17[7],xmm15[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm19, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm16[0],zero,xmm16[1],zero,xmm16[2],zero,xmm16[3],zero,xmm16[4],zero,xmm16[5],zero,xmm16[6],zero,xmm16[7],zero +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm10, %ymm11, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm16, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4 +; 
AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm18[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm25, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm4 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm15[8],xmm17[9],xmm15[9],xmm17[10],xmm15[10],xmm17[11],xmm15[11],xmm17[12],xmm15[12],xmm17[13],xmm15[13],xmm17[14],xmm15[14],xmm17[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm22, %zmm17 ; AVX512DQ-BW-FCP-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} ; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm17, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm15 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512DQ-BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[2,2,2,3] ; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 
0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm23, %ymm19 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm22, %ymm23, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm16, %ymm23, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512DQ-BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm25, %zmm18 ; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm17 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm18[0],ymm24[1],ymm18[1],ymm24[2],ymm18[2],ymm24[3],ymm18[3],ymm24[4],ymm18[4],ymm24[5],ymm18[5],ymm24[6],ymm18[6],ymm24[7],ymm18[7],ymm24[16],ymm18[16],ymm24[17],ymm18[17],ymm24[18],ymm18[18],ymm24[19],ymm18[19],ymm24[20],ymm18[20],ymm24[21],ymm18[21],ymm24[22],ymm18[22],ymm24[23],ymm18[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm21 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = 
ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm18, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm9 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm10, %ymm23, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm25, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm6 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm21, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm8, %ymm11, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm22, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm24, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; 
AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm6[0],ymm21[1],ymm6[1],ymm21[2],ymm6[2],ymm21[3],ymm6[3],ymm21[4],ymm6[4],ymm21[5],ymm6[5],ymm21[6],ymm6[6],ymm21[7],ymm6[7],ymm21[16],ymm6[16],ymm21[17],ymm6[17],ymm21[18],ymm6[18],ymm21[19],ymm6[19],ymm21[20],ymm6[20],ymm21[21],ymm6[21],ymm21[22],ymm6[22],ymm21[23],ymm6[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm22, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm7, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm8, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm5 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm8, %ymm23, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm25, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm5 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm21[8],ymm6[8],ymm21[9],ymm6[9],ymm21[10],ymm6[10],ymm21[11],ymm6[11],ymm21[12],ymm6[12],ymm21[13],ymm6[13],ymm21[14],ymm6[14],ymm21[15],ymm6[15],ymm21[24],ymm6[24],ymm21[25],ymm6[25],ymm21[26],ymm6[26],ymm21[27],ymm6[27],ymm21[28],ymm6[28],ymm21[29],ymm6[29],ymm21[30],ymm6[30],ymm21[31],ymm6[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm6, %ymm9, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm18, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm24, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm12[0],ymm8[0],ymm12[1],ymm8[1],ymm12[2],ymm8[2],ymm12[3],ymm8[3],ymm12[4],ymm8[4],ymm12[5],ymm8[5],ymm12[6],ymm8[6],ymm12[7],ymm8[7],ymm12[16],ymm8[16],ymm12[17],ymm8[17],ymm12[18],ymm8[18],ymm12[19],ymm8[19],ymm12[20],ymm8[20],ymm12[21],ymm8[21],ymm12[22],ymm8[22],ymm12[23],ymm8[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm18[8],ymm24[9],ymm18[9],ymm24[10],ymm18[10],ymm24[11],ymm18[11],ymm24[12],ymm18[12],ymm24[13],ymm18[13],ymm24[14],ymm18[14],ymm24[15],ymm18[15],ymm24[24],ymm18[24],ymm24[25],ymm18[25],ymm24[26],ymm18[26],ymm24[27],ymm18[27],ymm24[28],ymm18[28],ymm24[29],ymm18[29],ymm24[30],ymm18[30],ymm24[31],ymm18[31] ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} 
ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm12, %ymm18, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm12, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] ; AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm11, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm18, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm8 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[4],ymm6[4],ymm7[5],ymm6[5],ymm7[6],ymm6[6],ymm7[7],ymm6[7],ymm7[16],ymm6[16],ymm7[17],ymm6[17],ymm7[18],ymm6[18],ymm7[19],ymm6[19],ymm7[20],ymm6[20],ymm7[21],ymm6[21],ymm7[22],ymm6[22],ymm7[23],ymm6[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm9, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[2],ymm7[2],ymm2[3],ymm7[3],ymm2[4],ymm7[4],ymm2[5],ymm7[5],ymm2[6],ymm7[6],ymm2[7],ymm7[7],ymm2[16],ymm7[16],ymm2[17],ymm7[17],ymm2[18],ymm7[18],ymm2[19],ymm7[19],ymm2[20],ymm7[20],ymm2[21],ymm7[21],ymm2[22],ymm7[22],ymm2[23],ymm7[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm18, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm2 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm12, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: 
vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 61bfee133d84e..d6e58d6eeb857 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1045,20 +1045,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512-NEXT: vporq %zmm2, %zmm3, %zmm2 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 & (zmm3 | zmm2) ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512-NEXT: vmovq %xmm1, 48(%rax) -; AVX512-NEXT: vmovdqa %ymm0, (%rax) +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm0 & ~zmm1) | zmm3 +; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512-NEXT: vmovq %xmm0, 48(%rax) 
+; AVX512-NEXT: vmovdqa %ymm1, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -1127,20 +1129,22 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,8],zero,zero,ymm4[u,u,u,1,9],zero,zero,ymm4[u,u,u,2,10,18,26,u,u,u],zero,zero,ymm4[19,27,u,u,u],zero,zero,ymm4[20,28] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,5,13],zero,zero,ymm4[u,u,u,6,14],zero,zero,ymm4[u,u,u],zero,zero,ymm4[23,31,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 -; AVX512DQ-NEXT: vporq %zmm2, %zmm3, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 & (zmm3 | zmm2) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u],zero,zero,ymm0[0,u,u,u,u],zero,zero,ymm0[1,u,u,u,u,18,26],zero,ymm0[u,u,u,u,19,27],zero,ymm0[u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,ymm0[4,u,u,u,u],zero,zero,ymm0[5,u,u,u,u],zero,zero,zero,ymm0[u,u,u,u,23,31],zero,ymm0[u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,0,8],zero,ymm1[u,u,u,u,1,9],zero,ymm1[u,u,u,u],zero,zero,ymm1[18,u,u,u,u],zero,zero,ymm1[19,u,u,u,u] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,12],zero,ymm1[u,u,u,u,5,13],zero,ymm1[u,u,u,u,6,14,22,u,u,u,u],zero,zero,ymm1[23,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) -; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) -; AVX512DQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; AVX512DQ-NEXT: vmovq %xmm1, 48(%rax) -; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax) +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm0 & ~zmm1) | zmm3 +; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) +; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0 +; AVX512DQ-NEXT: vmovq %xmm0, 48(%rax) +; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2048,34 +2052,39 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,ymm6[u,u,u,u,u,5],zero,ymm6[u,u,u,u,u,6],zero,ymm6[u,u,u,u,u],zero,ymm6[23,u,u,u,u,u],zero,ymm6[24,u,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = 
zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm7) -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm7 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm5)) -; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,1,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,3,3,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9,u,u,u],zero,zero,ymm7[2,10,u,u,u],zero,zero,ymm7[3,19,u,u,u],zero,zero,ymm7[28,20,u,u,u],zero,zero,ymm7[29,21,u] -; AVX512-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9],zero,ymm6[u,u,u,u,2,10],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u,20,28],zero,ymm6[u,u,u,u,21] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5)) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[4,u,u,u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = mem & (ymm9 | ymm8) +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8],zero,ymm8[u,u,u,u,1,9],zero,ymm8[u,u,u,u,18,26],zero,ymm8[u,u,u,u,19,27],zero,ymm8[u,u,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,0,4,4,5,4] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm9 & mem) | zmm8 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm9 & ~zmm7) | zmm6 +; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[3,1,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm3[1,3,3,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ~ymm9 & (ymm5 | ymm8) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm6 & ymm9) ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] @@ -2086,9 +2095,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512-NEXT: vmovdqa %ymm6, 64(%rax) ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -2115,34 +2124,39 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm8 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512-FCP-NEXT: vmovdqa64 
{{.*#+}} zmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] +; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm8, %zmm9 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] +; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4],zero,ymm5[u,u,u,u,1,5],zero,ymm5[u,u,u,u,2,6],zero,ymm5[u,u,u,u,19,23],zero,ymm5[u,u,u,u,24,28],zero,ymm5[u] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm8 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,0,8],zero,ymm10[u,u,u,u,1,9],zero,ymm10[u,u,u,u,18,26],zero,ymm10[u,u,u,u,19,27],zero,ymm10[u,u,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm9 & mem) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm7 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm5 & ~zmm7) | zmm6 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[3,1,1,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm4[1,3,3,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,3,3,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] +; AVX512-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,6] ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512-FCP-NEXT: vpandn %ymm6, %ymm9, %ymm6 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,1,9],zero,ymm8[u,u,u,u,2,10],zero,ymm8[u,u,u,u,19,27],zero,ymm8[u,u,u,u,20,28],zero,ymm8[u,u,u,u,21] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 
| ymm6) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm5 & ymm9) ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] @@ -2153,9 +2167,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512-FCP-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -2185,34 +2199,39 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,2,0,2] ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,u,u,1,9],zero,zero,ymm6[u,u,u,2,10],zero,zero,ymm6[u,u,u,19,27],zero,zero,ymm6[u,u,u,20,28],zero,zero ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6 -; AVX512DQ-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm6 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,ymm6[u,u,u,u,u,5],zero,ymm6[u,u,u,u,u,6],zero,ymm6[u,u,u,u,u],zero,ymm6[23,u,u,u,u,u],zero,ymm6[24,u,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4,u,u,u,u,u],zero,ymm8[5,u,u,u,u,u],zero,ymm8[6,u,u,u,u,u,23],zero,ymm8[u,u,u,u,u,24],zero,ymm8[u,u] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = mem & (ymm8 | ymm7) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,0,8],zero,ymm7[u,u,u,u,1,9],zero,ymm7[u,u,u,u,18,26],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & mem) | zmm7 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm5)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[3,1,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[1,3,3,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9,u,u,u],zero,zero,ymm7[2,10,u,u,u],zero,zero,ymm7[3,19,u,u,u],zero,zero,ymm7[28,20,u,u,u],zero,zero,ymm7[29,21,u] -; AVX512DQ-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9],zero,ymm6[u,u,u,u,2,10],zero,ymm6[u,u,u,u,19,27],zero,ymm6[u,u,u,u,20,28],zero,ymm6[u,u,u,u,21] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ 
ymm5)) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[4],zero,ymm5[u,u,u,u,u,5],zero,ymm5[u,u,u,u,u,6],zero,ymm5[u,u,u,u,u],zero,ymm5[23,u,u,u,u,u],zero,ymm5[24,u,u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[4,u,u,u,u,u],zero,ymm9[5,u,u,u,u,u],zero,ymm9[6,u,u,u,u,u,23],zero,ymm9[u,u,u,u,u,24],zero,ymm9[u,u] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = mem & (ymm9 | ymm8) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,0,2] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8],zero,ymm8[u,u,u,u,1,9],zero,ymm8[u,u,u,u,18,26],zero,ymm8[u,u,u,u,19,27],zero,ymm8[u,u,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm9 = zmm9[0,0,1,0,4,4,5,4] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = (zmm9 & mem) | zmm8 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm9 & ~zmm7) | zmm6 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[3,1,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,ymm6[u,u,u,10,2],zero,zero,ymm6[u,u,u,11,3],zero,zero,ymm6[u,u,u,20,28],zero,zero,ymm6[u,u,u,21,29],zero,zero,ymm6[u] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm3[1,3,3,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] +; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-NEXT: vpandn %ymm8, %ymm9, %ymm8 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[1,3,1,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,1,9],zero,ymm5[u,u,u,u,2,10],zero,ymm5[u,u,u,u,19,27],zero,ymm5[u,u,u,u,20,28],zero,ymm5[u,u,u,u,21] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ~ymm9 & (ymm5 | ymm8) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm6 & ymm9) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] @@ -2223,9 +2242,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512DQ-NEXT: vmovdqa %ymm6, 
64(%rax) ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-NEXT: vmovdqa %ymm5, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -2252,34 +2271,39 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,5],zero,zero,ymm6[u,u,u,2,6],zero,zero,ymm6[u,u,u,19,23],zero,zero,ymm6[u,u,u,24,28],zero,zero,ymm6[u,u,u,25] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 -; AVX512DQ-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm7, %zmm8 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,9,2,10,1,9,2,10] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,4],zero,ymm6[u,u,u,u,1,5],zero,ymm6[u,u,u,u,2,6],zero,ymm6[u,u,u,u,19,23],zero,ymm6[u,u,u,u,24,28],zero,ymm6[u] -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm7 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,2,0,2] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,0,8],zero,ymm9[u,u,u,u,1,9],zero,ymm9[u,u,u,u,18,26],zero,ymm9[u,u,u,u,19,27],zero,ymm9[u,u,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm8 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 & (zmm6 | zmm5) +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm8, %zmm9 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,2,10,1,9,2,10] +; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm1, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,4],zero,ymm5[u,u,u,u,1,5],zero,ymm5[u,u,u,u,2,6],zero,ymm5[u,u,u,u,19,23],zero,ymm5[u,u,u,u,24,28],zero,ymm5[u] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm8 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,0,8],zero,ymm10[u,u,u,u,1,9],zero,ymm10[u,u,u,u,18,26],zero,ymm10[u,u,u,u,19,27],zero,ymm10[u,u,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm5 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm9 & mem) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm7 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm5 & ~zmm7) | zmm6 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[3,1,1,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[1],zero,zero,ymm5[u,u,u,10,2],zero,zero,ymm5[u,u,u,11,3],zero,zero,ymm5[u,u,u,20,28],zero,zero,ymm5[u,u,u,21,29],zero,zero,ymm5[u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = 
ymm4[1,3,3,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[1,9,u,u,u],zero,zero,ymm8[2,10,u,u,u],zero,zero,ymm8[3,19,u,u,u],zero,zero,ymm8[28,20,u,u,u],zero,zero,ymm8[29,21,u] -; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[1,3,3,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,9,u,u,u],zero,zero,ymm6[2,10,u,u,u],zero,zero,ymm6[3,19,u,u,u],zero,zero,ymm6[28,20,u,u,u],zero,zero,ymm6[29,21,u] +; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,5,6] ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,1,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,1,9],zero,ymm7[u,u,u,u,2,10],zero,ymm7[u,u,u,u,19,27],zero,ymm7[u,u,u,u,20,28],zero,ymm7[u,u,u,u,21] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5)) +; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpandn %ymm6, %ymm9, %ymm6 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[1,3,1,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,1,9],zero,ymm8[u,u,u,u,2,10],zero,ymm8[u,u,u,u,19,27],zero,ymm8[u,u,u,u,20,28],zero,ymm8[u,u,u,u,21] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ~ymm9 & (ymm8 | ymm6) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm5 & ymm9) ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] @@ -2290,9 +2314,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[13,u,u,u,u],zero,zero,xmm0[14,u,u,u,u],zero,zero,xmm0[15] ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm3)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -2302,9 +2326,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 -; AVX512BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512BW-NEXT: vmovdqa (%r10), %xmm4 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero ; AVX512BW-NEXT: 
vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] @@ -2322,7 +2346,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512BW-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero @@ -2331,46 +2355,46 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-NEXT: vpermw %zmm2, %zmm8, %zmm8 +; AVX512BW-NEXT: vpermw %zmm4, %zmm8, %zmm8 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k1} ; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,7,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] -; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero -; AVX512BW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm4 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[1,3,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %ymm4, %ymm7, %ymm7 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[3,19],zero,zero,zero,zero,zero,ymm0[28,20],zero,zero,zero,zero,zero,ymm0[29,21],zero -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,1,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1],zero,zero,zero,zero,zero,ymm1[10,2],zero,zero,zero,zero,zero,ymm1[11,3],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero,zero,zero,zero,ymm1[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,3,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[3,1,1,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} -; AVX512BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,7,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] +; AVX512BW-NEXT: movw $-32510, %cx # imm = 0x8102 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqu8 %xmm3, %xmm2 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[6,14],zero,zero,zero,zero,zero,xmm1[7,15],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,14],zero,zero,zero,zero,zero,xmm0[7,15],zero,zero,zero +; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512BW-NEXT: kmovd %ecx, %k1 +; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1} +; AVX512BW-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512BW-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -2432,8 +2456,8 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] ; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 -; AVX512BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; 
AVX512BW-FCP-NEXT: retq @@ -2444,9 +2468,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 -; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm4 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,ymm0[6],zero,zero,zero,zero,zero,zero,zero,ymm0[23],zero,zero,zero,zero,zero,zero,ymm0[24],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[2,3,0,1] @@ -2464,7 +2488,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512DQ-BW-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero @@ -2473,46 +2497,46 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm8, %zmm8 +; AVX512DQ-BW-NEXT: vpermw %zmm4, %zmm8, %zmm8 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k1} ; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,7,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,3,2] -; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm4, %xmm3 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm1[1,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,zero,zero,zero,xmm7[6,14],zero,zero,zero,zero,zero,xmm7[7,15],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm4 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] -; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm3, %ymm2 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm6[1,3,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6] +; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermw %ymm4, %ymm7, %ymm7 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] ; AVX512DQ-BW-NEXT: movl $67637280, %ecx # imm = 0x4081020 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,3,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm0[1,9],zero,zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,zero,ymm0[3,19],zero,zero,zero,zero,zero,ymm0[28,20],zero,zero,zero,zero,zero,ymm0[29,21],zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,1,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[1],zero,zero,zero,zero,zero,ymm1[10,2],zero,zero,zero,zero,zero,ymm1[11,3],zero,zero,zero,zero,zero,ymm1[20,28],zero,zero,zero,zero,zero,ymm1[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[1,3,3,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[2,10],zero,zero,zero,zero,zero,ymm7[3,19],zero,zero,zero,zero,zero,ymm7[28,20],zero,zero,zero,zero,zero,ymm7[29,21],zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[3,1,1,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa %ymm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa %xmm4, 96(%rax) +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm7 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,7,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,2] +; AVX512DQ-BW-NEXT: movw $-32510, %cx # imm = 0x8102 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm3, %xmm2 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[6,14],zero,zero,zero,zero,zero,xmm1[7,15],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[6,14],zero,zero,zero,zero,zero,xmm0[7,15],zero,zero,zero +; 
AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 +; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm7, 64(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq @@ -2574,8 +2598,8 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] ; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -4073,134 +4097,152 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vmovdqa (%r9), %ymm5 -; AVX512-NEXT: vmovdqa64 (%r10), %ymm18 -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX512-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1] -; AVX512-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6,u,u] +; AVX512-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-NEXT: vmovdqa64 (%r10), %ymm17 +; AVX512-NEXT: vmovdqa (%r9), %xmm4 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm4[4,u,u,u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6] +; AVX512-NEXT: vmovdqa (%r8), %xmm7 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[4],zero,xmm7[u,u,u,7],zero,xmm7[5],zero,xmm7[u,u,u,8],zero,xmm7[6],zero +; AVX512-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,0,1],zmm9[0,1,0,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = 
[255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-NEXT: vpandq %zmm11, %zmm9, %zmm9 +; AVX512-NEXT: vmovdqa (%r10), %xmm10 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,1,0,4,4,5,4] +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm12 & ~zmm15) | zmm9 +; AVX512-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX512-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm9[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] ; AVX512-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm11[0,1,0,1],zmm10[0,1,0,1] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) -; AVX512-NEXT: vmovdqa (%r9), %xmm10 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] -; AVX512-NEXT: vmovdqa (%r8), %xmm11 -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero -; AVX512-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm7[0,1,0,1] -; AVX512-NEXT: vmovdqa (%r10), %xmm12 -; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512-NEXT: vpermq {{.*#+}} zmm17 = zmm0[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm16)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25] -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm8[18,19,20,21],zero,ymm8[19],zero,ymm8[25,26,27,22],zero,ymm8[20],zero -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm15[2,3,2,3] -; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm16 -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[2,3,2,3],zmm1[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm15[2,3,2,3] -; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] -; AVX512-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm9[0,1,0,1],zmm7[0,1,0,1] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm16)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm15 ^ (mem & (zmm20 ^ zmm15)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm15[2,3,2,3],zmm7[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23,u,u,u],zero,ymm5[26],zero,ymm5[24,u,u,u],zero,ymm5[27],zero +; AVX512-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[2,3,2,3],zmm15[2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 & (zmm8 | zmm7) +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20],zero,zero +; AVX512-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm7[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] ; AVX512-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[2,3,2,3],zmm2[2,3,2,3] -; AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-NEXT: vpermi2d %zmm2, %zmm7, %zmm15 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] +; AVX512-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm0 & ~zmm1) | zmm8 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,ymm6[20,21,20,21],zero,ymm6[19],zero,ymm6[19,20,21,22],zero +; AVX512-NEXT: vshufi64x2 
{{.*#+}} zmm7 = zmm8[2,3,2,3],zmm7[2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm8 & (zmm7 | zmm0) +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,4,0,5,7,4,5,20,21,0,23,20,21,22,23] +; AVX512-NEXT: vpermi2d %zmm0, %zmm15, %zmm16 +; AVX512-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm15 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm16 & ~zmm15) | zmm7 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm1)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14,u,u],zero,zero,zero,zero,ymm3[15,u,u],zero,zero,zero,zero,ymm3[16,u,u],zero,zero,zero,zero,ymm3[17,u,u],zero,zero,zero,zero,ymm3[18] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[u,u,0,1,14,15],zero,ymm6[u,u,13,2,3,16],zero,ymm6[u,u,28,29,16,17],zero,ymm6[u,u,19,28,29,18],zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u,u,u] +; AVX512-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] +; AVX512-NEXT: vmovdqa %ymm5, %ymm3 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u],zero,ymm13[14,u,u,u,u,u],zero,ymm13[15,u,u,u,u,u],zero,ymm13[16,u,u,u,u,u],zero,ymm13[17,u,u,u,u,u] -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,14],zero,ymm8[u,u,u,u,u,15],zero,ymm8[u,u,u,u,u,16],zero,ymm8[u,u,u,u,u,17],zero,ymm8[u,u,u,u,u] -; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u],zero,zero,zero,zero,ymm5[14],zero,ymm5[u],zero,zero,zero,zero,ymm5[15],zero,ymm5[u],zero,zero,zero,zero,ymm5[16],zero,ymm5[u],zero,zero,zero,zero,ymm5[17],zero,ymm5[u],zero,zero -; AVX512-NEXT: vmovdqa64 
%ymm19, %ymm9 -; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm13[30],zero,ymm13[28,u,u,u],zero,ymm13[31],zero,ymm13[29,u] +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm5[14,u,u],zero,zero,zero,zero,ymm5[15,u,u],zero,zero,zero,zero,ymm5[16,u,u],zero,zero,zero,zero,ymm5[17,u,u],zero,zero,zero,zero,ymm5[18] +; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,14],zero,ymm4[u,u,0,1,14,15],zero,ymm4[u,u,13,2,3,16],zero,ymm4[u,u,28,29,16,17],zero,ymm4[u,u,19,28,29,18],zero +; AVX512-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm0 & ~zmm7) | zmm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm2[u],zero,zero,zero,zero,ymm2[14],zero,ymm2[u],zero,zero,zero,zero,ymm2[15],zero,ymm2[u],zero,zero,zero,zero,ymm2[16],zero,ymm2[u],zero,zero,zero,zero,ymm2[17],zero,ymm2[u],zero,zero +; AVX512-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm6[13,u,u,u,u],zero,zero,ymm6[14,u,u,u,u],zero,zero,ymm6[15,u,u,u,u],zero,zero,ymm6[16,u,u,u,u],zero,zero,ymm6[17,u,u] +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29] +; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm0 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm6[28],zero,ymm6[30,31,30,31],zero,ymm6[29],zero,ymm6[31,28,29] -; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27,u,u,u],zero,ymm9[30],zero,ymm9[28,u,u,u],zero,ymm9[31],zero -; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm8[30],zero,ymm8[28,u,u,u],zero,ymm8[31],zero,ymm8[29,u] +; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm1)) -; AVX512-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm12 & (ymm3 ^ ymm0)) +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27,u,u,u],zero,ymm9[30],zero,ymm9[28,u,u,u],zero,ymm9[31],zero +; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm3)) +; AVX512-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -4208,129 +4250,147 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[2,3,2,3],zmm7[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] -; AVX512-FCP-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] -; AVX512-FCP-NEXT: vporq %zmm8, %zmm9, %zmm8 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[2,3,2,3],zmm7[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] -; AVX512-FCP-NEXT: vporq %zmm7, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,21,20,0,21,0,20,0,4,5,0,7,0,5,0,7] -; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm7 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8)) -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] -; AVX512-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %ymm17 +; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm3 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] +; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm11 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[4,u,u,u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6] +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm9 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[4],zero,xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero +; AVX512-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm12[0,1,0,1],zmm8[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-FCP-NEXT: vpandq %zmm12, %zmm8, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm11 & ~zmm15) | zmm8 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = 
xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm8[0,1,0,1] ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,0,1],zmm11[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) -; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm11 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,17,0,17,0,16,16,0,0,0,0,0,2,3,0,1] -; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm13, %zmm17 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm13 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero -; AVX512-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm17 ^ (mem & (zmm8 ^ zmm17)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm8[0,1,0,1],zmm7[0,1,0,1] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm16)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm15 ^ (mem & (zmm20 ^ zmm15)) +; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm15[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero +; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm15[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm15 & (zmm0 | zmm7) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20],zero,zero +; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm1 & ~zmm2) | zmm0 +; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[2,3,2,3],zmm1[2,3,2,3] 
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm7 & (zmm1 | zmm0) +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,21,20,0,21,0,20,0,4,5,0,7,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm15, %zmm16 +; AVX512-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm16 & ~zmm15) | zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm2)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,14],zero,ymm8[u,u,u,u,u,15],zero,ymm8[u,u,u,u,u,16],zero,ymm8[u,u,u,u,u,17],zero,ymm8[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm1, %ymm12, %ymm1 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm12 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm12[u],zero,zero,zero,zero,ymm12[14],zero,ymm12[u],zero,zero,zero,zero,ymm12[15],zero,ymm12[u],zero,zero,zero,zero,ymm12[16],zero,ymm12[u],zero,zero,zero,zero,ymm12[17],zero,ymm12[u],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] +; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm11[14,u,u],zero,zero,zero,zero,ymm11[15,u,u],zero,zero,zero,zero,ymm11[16,u,u],zero,zero,zero,zero,ymm11[17,u,u],zero,zero,zero,zero,ymm11[18] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512-FCP-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm0 & ~zmm2) | zmm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u],zero,zero,zero,zero,ymm5[14],zero,ymm5[u],zero,zero,zero,zero,ymm5[15],zero,ymm5[u],zero,zero,zero,zero,ymm5[16],zero,ymm5[u],zero,zero,zero,zero,ymm5[17],zero,ymm5[u],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; 
AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] +; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) -; AVX512-FCP-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm7 & (ymm2 ^ ymm0)) +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27,u,u,u],zero,ymm10[30],zero,ymm10[28,u,u,u],zero,ymm10[31],zero +; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512-FCP-NEXT: vpshufb 
{{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512-FCP-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -4338,134 +4398,152 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512DQ-NEXT: vmovdqa64 (%rcx), %ymm19 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 -; AVX512DQ-NEXT: vmovdqa64 (%r10), %ymm18 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u],zero,xmm0[7],zero,xmm0[5,u,u,u],zero,xmm0[8],zero,xmm0[6,u,u,u],zero -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u,u,9] -; AVX512DQ-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6,u,u] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm5 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm0 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-NEXT: vmovdqa64 (%r10), %ymm17 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm4[4,u,u,u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6] +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[4],zero,xmm7[u,u,u,7],zero,xmm7[5],zero,xmm7[u,u,u,8],zero,xmm7[6],zero +; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm18 +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm19 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,0,1],zmm9[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vpandq %zmm11, %zmm9, %zmm9 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm10 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd 
{{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,1,0,4,4,5,4] +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm11, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm12 & ~zmm15) | zmm9 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512DQ-NEXT: vpor %xmm9, %xmm12, %xmm9 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm9[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm12 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm11[0,1,0,1],zmm10[0,1,0,1] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm10 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm10[4,u,u,u],zero,xmm10[7],zero,xmm10[5,u,u,u],zero,xmm10[8],zero,xmm10[6] -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm11 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero -; AVX512DQ-NEXT: vpor %xmm7, %xmm12, %xmm7 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm12 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm17 = zmm0[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm16)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25] -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[18],zero,zero,zero,zero,ymm7[21],zero,ymm7[19],zero,zero,zero,zero,ymm7[22],zero,ymm7[20] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: 
vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm8[18,19,20,21],zero,ymm8[19],zero,ymm8[25,26,27,22],zero,ymm8[20],zero -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm16 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero,zero -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm15[2,3,2,3],zmm1[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] -; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512DQ-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm9[0,1,0,1],zmm7[0,1,0,1] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm16)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm15 ^ (mem & (zmm20 ^ zmm15)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] +; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm15[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[23,u,u,u],zero,ymm5[26],zero,ymm5[24,u,u,u],zero,ymm5[27],zero +; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = 
[255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm15 & (zmm8 | zmm7) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20],zero,zero +; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm5 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm15[2,3,2,3],zmm2[2,3,2,3] -; AVX512DQ-NEXT: vporq %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm4 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm7, %zmm15 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] +; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm4 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm0 & ~zmm1) | zmm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,ymm6[20,21,20,21],zero,ymm6[19],zero,ymm6[19,20,21,22],zero +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 
= zmm8 & (zmm7 | zmm0) +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,4,0,5,7,4,5,20,21,0,23,20,21,22,23] +; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm15, %zmm16 +; AVX512DQ-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm8, %zmm15 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm16 & ~zmm15) | zmm7 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm1)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm3[14,u,u],zero,zero,zero,zero,ymm3[15,u,u],zero,zero,zero,zero,ymm3[16,u,u],zero,zero,zero,zero,ymm3[17,u,u],zero,zero,zero,zero,ymm3[18] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[u,u,0,1,14,15],zero,ymm6[u,u,13,2,3,16],zero,ymm6[u,u,28,29,16,17],zero,ymm6[u,u,19,28,29,18],zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm8 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] +; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm3 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u],zero,ymm13[14,u,u,u,u,u],zero,ymm13[15,u,u,u,u,u],zero,ymm13[16,u,u,u,u,u],zero,ymm13[17,u,u,u,u,u] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,14],zero,ymm8[u,u,u,u,u,15],zero,ymm8[u,u,u,u,u,16],zero,ymm8[u,u,u,u,u,17],zero,ymm8[u,u,u,u,u] -; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u],zero,zero,zero,zero,ymm5[14],zero,ymm5[u],zero,zero,zero,zero,ymm5[15],zero,ymm5[u],zero,zero,zero,zero,ymm5[16],zero,ymm5[u],zero,zero,zero,zero,ymm5[17],zero,ymm5[u],zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm9 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] -; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm1)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm13[30],zero,ymm13[28,u,u,u],zero,ymm13[31],zero,ymm13[29,u] +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm5 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm5[14,u,u],zero,zero,zero,zero,ymm5[15,u,u],zero,zero,zero,zero,ymm5[16,u,u],zero,zero,zero,zero,ymm5[17,u,u],zero,zero,zero,zero,ymm5[18] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[0,1,14],zero,ymm4[u,u,0,1,14,15],zero,ymm4[u,u,13,2,3,16],zero,ymm4[u,u,28,29,16,17],zero,ymm4[u,u,19,28,29,18],zero +; AVX512DQ-NEXT: vpor %ymm1, %ymm7, %ymm1 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm7, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512DQ-NEXT: vpandq %zmm7, %zmm1, %zmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = (zmm0 & ~zmm7) | zmm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm2[u],zero,zero,zero,zero,ymm2[14],zero,ymm2[u],zero,zero,zero,zero,ymm2[15],zero,ymm2[u],zero,zero,zero,zero,ymm2[16],zero,ymm2[u],zero,zero,zero,zero,ymm2[17],zero,ymm2[u],zero,zero +; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm9 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm11 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; 
AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm6[13,u,u,u,u],zero,zero,ymm6[14,u,u,u,u],zero,zero,ymm6[15,u,u,u,u],zero,zero,ymm6[16,u,u,u,u],zero,zero,ymm6[17,u,u] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm4[28],zero,ymm4[30,31,30,31],zero,ymm4[29],zero,ymm4[31,28,29] +; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm0 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm6[28],zero,ymm6[30,31,30,31],zero,ymm6[29],zero,ymm6[31,28,29] -; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27,u,u,u],zero,ymm9[30],zero,ymm9[28,u,u,u],zero,ymm9[31],zero -; AVX512DQ-NEXT: vpor %ymm0, %ymm3, %ymm0 -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm8[30],zero,ymm8[28,u,u,u],zero,ymm8[31],zero,ymm8[29,u] +; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm1)) -; AVX512DQ-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm12 & (ymm3 ^ ymm0)) +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm9[27,u,u,u],zero,ymm9[30],zero,ymm9[28,u,u,u],zero,ymm9[31],zero +; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-NEXT: 
vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm3)) +; AVX512DQ-NEXT: vmovdqa %ymm2, 192(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4473,129 +4551,147 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[23,u,u,u],zero,ymm4[26],zero,ymm4[24,u,u,u],zero,ymm4[27],zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm5[18,19,20,21],zero,ymm5[19],zero,ymm5[25,26,27,22],zero,ymm5[20],zero -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm8, %zmm7 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20],zero,zero -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[2,3,2,3],zmm8[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm5[23],zero,ymm5[21,22,23,26],zero,ymm5[24],zero,ymm5[28,29,26,27] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm3[19],zero,ymm3[21,20,21,22],zero,ymm3[20],zero,ymm3[22,23] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-FCP-NEXT: vporq %zmm8, %zmm9, %zmm8 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[2,3,2,3],zmm7[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm1[18],zero,ymm1[20,21,20,21],zero,ymm1[19],zero,ymm1[19,20,21,22],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3] -; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,21,20,0,21,0,20,0,4,5,0,7,0,5,0,7] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm11, %zmm7 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] -; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm11[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %ymm17 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm3 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,17,0,17,0,16,16,0,0,1,0,1,2,3,0,1] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[4,u,u,u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6] +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[4],zero,xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero +; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm12[0,1,0,1],zmm8[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-FCP-NEXT: vpandq %zmm12, %zmm8, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm12, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm11 & ~zmm15) | zmm8 +; 
AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u,u,9] +; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm12[0,1,0,1],zmm8[0,1,0,1] ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm14 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] -; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm13, %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm13[0,1,0,1],zmm11[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm11 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,17,0,17,0,16,16,0,0,0,0,0,2,3,0,1] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm13, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm13 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm13[4,u,u,u],zero,xmm13[7],zero,xmm13[5,u,u,u],zero,xmm13[8],zero,xmm13[6] -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[4],zero,xmm15[u,u,u,7],zero,xmm15[5],zero,xmm15[u,u,u,8],zero,xmm15[6],zero -; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm17 ^ (mem & (zmm8 ^ zmm17)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm16)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm8[0,1,0,1],zmm7[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm16)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm15 ^ (mem & (zmm20 ^ zmm15)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm15[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm10[23,u,u,u],zero,ymm10[26],zero,ymm10[24,u,u,u],zero,ymm10[27],zero +; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm15 & (zmm0 | zmm7) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20],zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm21 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm8[23],zero,ymm8[21,22,23,26],zero,ymm8[24],zero,ymm8[28,29,26,27] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm2[19],zero,ymm2[21,20,21,22],zero,ymm2[20],zero,ymm2[22,23] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm1 & ~zmm2) | zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm5 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm4[23],zero,ymm4[23,24,25,26],zero,ymm4[24],zero,ymm4[30,31] +; 
AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm4[18],zero,ymm4[20,21,20,21],zero,ymm4[19],zero,ymm4[19,20,21,22],zero +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm7 & (zmm1 | zmm0) +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,21,20,0,21,0,20,0,4,5,0,7,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm15, %zmm16 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = (zmm16 & ~zmm15) | zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,14],zero,ymm8[u,u,u,u,u,15],zero,ymm8[u,u,u,u,u,16],zero,ymm8[u,u,u,u,u,17],zero,ymm8[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm12, %ymm1 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm9, %zmm1 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm12 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm12[u],zero,zero,zero,zero,ymm12[14],zero,ymm12[u],zero,zero,zero,zero,ymm12[15],zero,ymm12[u],zero,zero,zero,zero,ymm12[16],zero,ymm12[u],zero,zero,zero,zero,ymm12[17],zero,ymm12[u],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm0 -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm11 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm11[13,u,u,u,u],zero,zero,ymm11[14,u,u,u,u],zero,zero,ymm11[15,u,u,u,u],zero,zero,ymm11[16,u,u,u,u],zero,zero,ymm11[17,u,u] -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm0 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm11[14,u,u],zero,zero,zero,zero,ymm11[15,u,u],zero,zero,zero,zero,ymm11[16,u,u],zero,zero,zero,zero,ymm11[17,u,u],zero,zero,zero,zero,ymm11[18] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero +; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpandq %zmm2, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = (zmm0 & ~zmm2) | zmm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm5[u],zero,zero,zero,zero,ymm5[14],zero,ymm5[u],zero,zero,zero,zero,ymm5[15],zero,ymm5[u],zero,zero,zero,zero,ymm5[16],zero,ymm5[u],zero,zero,zero,zero,ymm5[17],zero,ymm5[u],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u] ; AVX512DQ-FCP-NEXT: 
vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27,u,u,u],zero,ymm2[30],zero,ymm2[28,u,u,u],zero,ymm2[31],zero +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm4[13,u,u,u,u],zero,zero,ymm4[14,u,u,u,u],zero,zero,ymm4[15,u,u,u,u],zero,zero,ymm4[16,u,u,u,u],zero,zero,ymm4[17,u,u] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u],zero,ymm6[30],zero,ymm6[28,u,u,u],zero,ymm6[31],zero,ymm6[29,u] +; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm0)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (mem & (ymm2 ^ ymm1)) -; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = 
ymm0 ^ (ymm7 & (ymm2 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27,u,u,u],zero,ymm10[30],zero,ymm10[28,u,u,u],zero,ymm10[31],zero +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm0)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2)) +; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -4603,71 +4699,71 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm7 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[0,1,14],zero,ymm13[12,13,0,1,14,15],zero,ymm13[3,12,13,2,3,16],zero,ymm13[30,31,28,29,16,17],zero,ymm13[31,18,19,28,29,18],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] -; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 -; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm0, %zmm5 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm14 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 -; AVX512BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm10 +; AVX512BW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm12 +; AVX512BW-NEXT: movabsq $2323999253380730912, %r8 # imm = 0x2040810204081020 +; AVX512BW-NEXT: kmovq %r8, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm12 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = 
ymm13[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm15 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512BW-NEXT: vpor %ymm5, %ymm13, %ymm5 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm16, %zmm5 +; AVX512BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm5 {%k1} ; AVX512BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vmovdqu8 %zmm12, %zmm5 {%k1} +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,1,1,4,4,5,5] ; AVX512BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero ; AVX512BW-NEXT: vporq %ymm15, %ymm16, %ymm15 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm14[2,3,2,3],zmm15[2,3,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm14 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm14[18,19,20,21],zero,zmm14[19],zero,zmm14[25,26,27,22],zero,zmm14[20],zero,zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero -; AVX512BW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm16 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm12[2,3,2,3],zmm15[2,3,2,3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm12 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm12 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm12[18,19,20,21],zero,zmm12[19],zero,zmm12[25,26,27,22],zero,zmm12[20],zero,zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm12[55],zero,zero,zero,zero,zmm12[58],zero,zmm12[56],zero,zero,zero,zero,zmm12[59],zero +; 
AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm16 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] -; AVX512BW-NEXT: vporq %zmm14, %zmm16, %zmm14 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vporq %zmm12, %zmm16, %zmm12 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] ; AVX512BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k2} +; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm12 {%k2} ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 +; AVX512BW-NEXT: vpermw %zmm0, %zmm15, %zmm15 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512BW-NEXT: vporq %zmm16, %zmm17, %zmm16 @@ -4677,67 +4773,67 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512BW-NEXT: kmovq %rcx, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm14 {%k2} -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,3,3,6,6,7,7] +; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm12 {%k2} +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX512BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 ; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[2,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm9 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,2,3] +; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm8 {%k1} +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,3,2,3] ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,0,1],zmm7[0,1,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6,u,u,u],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] -; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm9[0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] +; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,0,1],zmm7[0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,0,1],zmm8[0,1,0,1] ; AVX512BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero -; AVX512BW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,0,1],zmm4[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm8 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = 
zero,xmm3[4,u,u,u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[4],zero,xmm4[u,u,u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero +; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm7[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-NEXT: vpermw %zmm0, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm8 {%k1} +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] ; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm6 {%k1} ; AVX512BW-NEXT: vmovdqa %ymm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -4745,135 +4841,135 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512BW-FCP-NEXT: 
vmovdqa (%rsi), %xmm9 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm0, %zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm8 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm9 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %r8 # imm = 0x2040810204081020 +; AVX512BW-FCP-NEXT: kmovq %r8, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm11, %ymm7 +; 
AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] +; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512BW-FCP-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm13 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 -; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm7 {%k1} ; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} +; AVX512BW-FCP-NEXT: 
vinserti64x4 $1, %ymm3, %zmm5, %zmm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm15, %zmm10 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm13 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22],zero,zmm13[20],zero,zero,zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm13, %zmm10 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm13 = zmm10[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm16 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] ; AVX512BW-FCP-NEXT: vporq %zmm10, %zmm16, %zmm10 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: 
vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512BW-FCP-NEXT: vporq %zmm16, %zmm17, %zmm16 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm16 {%k1} ; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm15[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm16, %xmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,0,1],zmm13[0,1,0,1] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero,xmm11[u,u,u,9] ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm14[0,1,0,1] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = 
xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,0,1],zmm14[0,1,0,1] ; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,0,1],zmm9[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[4],zero,xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero +; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm12[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm9 ; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} ; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm11 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero +; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] -; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm3 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm4, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512BW-FCP-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512BW-FCP-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 192(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -4881,71 +4977,71 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm9 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm7 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm6 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm9 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm4 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[0,1,14],zero,ymm13[12,13,0,1,14,15],zero,ymm13[3,12,13,2,3,16],zero,ymm13[30,31,28,29,16,17],zero,ymm13[31,18,19,28,29,18],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm12 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm8, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm11 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = 
ymm14[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm14 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 -; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm10 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm4 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm12 +; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %r8 # imm = 0x2040810204081020 +; AVX512DQ-BW-NEXT: kmovq %r8, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm12 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[0,1,0,1,14],zero,ymm6[14,15,0,1,14,15],zero,ymm6[13,14,15,16,17,16],zero,ymm6[30,31,30,31,16,17],zero,ymm6[31,28,29,30,31] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm13, %zmm15 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm13, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm14 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm16, %zmm5 +; AVX512DQ-BW-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; 
AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm12, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm12 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-NEXT: movl $676341840, %ecx # imm = 0x28502850 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm12 {%k1} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm6[23],zero,ymm6[21,22,23,26],zero,ymm6[24],zero,ymm6[28,29,26,27] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm9[25],zero,ymm9[23],zero,zero,zero,zero,ymm9[26],zero,ymm9[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm16 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %ymm15, %ymm16, %ymm15 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm14[2,3,2,3],zmm15[2,3,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm14 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm14[18,19,20,21],zero,zmm14[19],zero,zmm14[25,26,27,22],zero,zmm14[20],zero,zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero,zmm14[59],zero -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm16 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm12[2,3,2,3],zmm15[2,3,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm12 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm12 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm12[18,19,20,21],zero,zmm12[19],zero,zmm12[25,26,27,22],zero,zmm12[20],zero,zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm12[55],zero,zero,zero,zero,zmm12[58],zero,zmm12[56],zero,zero,zero,zero,zmm12[59],zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm16 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] -; AVX512DQ-BW-NEXT: vporq %zmm14, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vporq %zmm12, %zmm16, %zmm12 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm12 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm15, %zmm15 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm16 = 
zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm17 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm16, %zmm17, %zmm16 @@ -4955,67 +5051,67 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512DQ-BW-NEXT: kmovq %rcx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm14 {%k2} -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm12 {%k2} +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX512DQ-BW-NEXT: movl $338170920, %ecx # imm = 0x14281428 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm13[2,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k2} = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm9[2,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm9 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm8 {%k1} +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,3,2,3] ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u] -; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm9[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u],zero,xmm4[7],zero,xmm4[5,u,u,u],zero,xmm4[8],zero,xmm4[6,u,u,u],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero,xmm3[u,u,u,9] -; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm9[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero,xmm13[u,u] +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm8[0,1,0,1],zmm7[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6,u,u,u],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,7],zero,xmm10[5],zero,xmm10[u,u,u,8],zero,xmm10[6],zero,xmm10[u,u,u,9] +; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm9[0,1,0,1],zmm8[0,1,0,1] ; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4],zero,xmm11[u,u,u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero -; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm7[0,1,0,1],zmm4[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm3[4,u,u,u],zero,xmm3[7],zero,xmm3[5,u,u,u],zero,xmm3[8],zero,xmm3[6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[4],zero,xmm4[u,u,u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero +; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm9, %xmm7 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm7[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: 
vmovdqu8 %zmm7, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} ; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpermw %ymm5, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm8 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm3, %ymm0 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] ; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-BW-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm1 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax) ; AVX512DQ-BW-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm6 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -5023,135 +5119,135 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[0,1,0,1,14],zero,ymm1[14,15,0,1,14,15],zero,ymm1[13,14,15,16,17,16],zero,ymm1[30,31,30,31,16,17],zero,ymm1[31,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[0,1,14],zero,ymm3[12,13,0,1,14,15],zero,ymm3[3,12,13,2,3,16],zero,ymm3[30,31,28,29,16,17],zero,ymm3[31,18,19,28,29,18],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18] -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm0, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[13],zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10 +; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %r8 # imm = 0x2040810204081020 +; AVX512DQ-BW-FCP-NEXT: kmovq %r8, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm7, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm11, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: 
vpshufb {{.*#+}} ymm7 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] +; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7 ; AVX512DQ-BW-FCP-NEXT: movabsq $435749858791416001, %rcx # imm = 0x60C1830183060C1 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[13],zero,zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %rcx # imm = 0x2040810204081020 -; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm7 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %rcx # imm = 0x3870E1C3870E1C38 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,zmm10[19],zero,zmm10[21,20,21,22],zero,zmm10[20],zero,zmm10[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57],zero,zmm10[55],zero,zmm10[53,54,55,58],zero,zmm10[56],zero,zmm10[60,61,58,59] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb 
{{.*#+}} zmm15 = zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm15[21],zero,zmm15[19],zero,zero,zero,zero,zmm15[22],zero,zmm15[20],zero,zero,zmm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm15[57],zero,zmm15[55],zero,zero,zero,zero,zmm15[58],zero,zmm15[56],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm15, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm10[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm13 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22],zero,zmm13[20],zero,zero,zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm13, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm13 = zmm10[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm10 = zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zmm10[18,19,20,21],zero,zmm10[19],zero,zmm10[25,26,27,22],zero,zmm10[20],zero,zmm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm10[55],zero,zero,zero,zero,zmm10[58],zero,zmm10[56],zero,zero,zero,zero,zmm10[59],zero -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[18],zero,zero,zero,zero,zmm16[21],zero,zmm16[19],zero,zero,zero,zero,zmm16[22],zero,zmm16[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm16[55],zero,zero,zero,zero,zmm16[58],zero,zmm16[56],zero,zero,zero,zero,zmm16[59],zero,zmm16[57] ; AVX512DQ-BW-FCP-NEXT: vporq %zmm10, %zmm16, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm10 = zmm10[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: movabsq $-9005497107459067808, %rcx # imm = 0x83060C180C183060 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm6[18],zero,zmm6[20,21,20,21],zero,zmm6[19],zero,zmm6[19,20,21,22],zero,zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm6[55],zero,zmm6[55,56,57,58],zero,zmm6[56],zero,zmm6[62,63] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[20],zero,zmm5[18],zero,zero,zero,zero,zmm5[21],zero,zmm5[19],zero,zero,zero,zero,zmm5[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[57],zero,zmm5[55],zero,zero,zero,zero,zmm5[58],zero,zmm5[56],zero,zero +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm16 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,zmm2[18],zero,zmm2[20,21,20,21],zero,zmm2[19],zero,zmm2[19,20,21,22],zero,zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,56,57],zero,zmm2[55],zero,zmm2[55,56,57,58],zero,zmm2[56],zero,zmm2[62,63] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = 
zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero ; AVX512DQ-BW-FCP-NEXT: vporq %zmm16, %zmm17, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm16 = zmm16[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rcx # imm = 0x1020408102040810 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm16 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rcx # imm = 0x1C3870E1C3870E1C ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u],zero,xmm14[7],zero,xmm14[5,u,u,u],zero,xmm14[8],zero,xmm14[6,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[u,u,u,7],zero,xmm12[5],zero,xmm12[u,u,u,8],zero,xmm12[6],zero,xmm12[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3],xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm15[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u],zero,xmm9[7],zero,xmm9[5,u,u,u],zero,xmm9[8],zero,xmm9[6,u,u,u],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm16, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,0,1],zmm13[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6,u,u,u],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,7],zero,xmm11[5],zero,xmm11[u,u,u,8],zero,xmm11[6],zero,xmm11[u,u,u,9] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm14[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm11[0,1,0,1],zmm14[0,1,0,1] ; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rcx # imm = 0xC183060C183060C ; 
AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm11[4,u,u,u],zero,xmm11[7],zero,xmm11[5,u,u,u],zero,xmm11[8],zero,xmm11[6] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[0,1,0,1],zmm9[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm7, %zmm11, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm8[4,u,u,u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[4],zero,xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm12[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rcx # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rcx # imm = 0x70E1C3870E1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm8 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm3[28],zero,ymm3[30,31,30,31],zero,ymm3[29],zero,ymm3[31,28,29] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm8, %zmm11 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,30],zero,ymm5[28],zero,ymm5[30,31,30,31],zero,ymm5[29],zero,ymm5[31,28,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm4[30],zero,ymm4[28],zero,zero,zero,zero,ymm4[31],zero,ymm4[29],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[30],zero,ymm3[28],zero,zero,zero,zero,ymm3[31],zero,ymm3[29],zero,zero +; 
AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] -; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm5[27],zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero,ymm6[29] -; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15] +; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm4, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm1[27],zero,zero,zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero,ymm2[29] +; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX512DQ-BW-FCP-NEXT: movl $-2130574328, %ecx # imm = 0x81020408 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rax) ; AVX512DQ-BW-FCP-NEXT: movl $-507279602, %ecx # imm = 0xE1C3870E ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -8372,2551 +8468,2721 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; ; AVX512-LABEL: store_i8_stride7_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $1256, %rsp # imm = 0x4E8 -; AVX512-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512-NEXT: vpshufb %ymm5, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa (%rdi), 
%ymm10 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512-NEXT: vpshufb %ymm4, %ymm10, %ymm1 +; AVX512-NEXT: subq $1592, %rsp # imm = 0x638 +; AVX512-NEXT: vmovdqa (%r8), %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm20 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[12,13,0,1,14,15],zero,ymm6[3,12,13,2,3,16],zero,ymm6[30,31,28,29,16,17],zero,ymm6[31,18,19,28,29,18],zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512-NEXT: vpshufb %ymm3, %ymm13, %ymm2 -; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %ymm14 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512-NEXT: vmovdqa (%r9), %ymm15 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm6, %ymm15, %ymm7 -; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm9, %ymm21 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512-NEXT: vpshufb %ymm12, 
%ymm10, %ymm7 -; AVX512-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512-NEXT: vporq %ymm0, %ymm7, %ymm31 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm9, %ymm13, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm10, %ymm11, %ymm7 -; AVX512-NEXT: vmovdqa64 %ymm11, %ymm16 -; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm7 -; AVX512-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm11, %ymm15, %ymm13 -; AVX512-NEXT: vmovdqa64 %ymm15, %ymm24 -; AVX512-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm13 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm12 -; AVX512-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm13 -; AVX512-NEXT: vmovdqa64 %ymm14, %ymm25 -; AVX512-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] -; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm11 -; AVX512-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} 
ymm11 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] -; AVX512-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa 32(%rax), %ymm14 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] +; AVX512-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm3, %ymm5, %ymm1 +; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm12[18,19,20,21],zero,ymm12[19],zero,ymm12[25,26,27,22],zero,ymm12[20],zero +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm1 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm12 +; AVX512-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] +; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm9, %ymm14, %ymm13 +; AVX512-NEXT: vpor %ymm12, %ymm13, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31] +; AVX512-NEXT: vpor %ymm12, %ymm13, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] -; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512-NEXT: vpshufb %ymm8, %ymm9, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[2,3,2,3],zmm12[2,3,2,3] ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpshufb %ymm2, %ymm9, %ymm0 -; AVX512-NEXT: vpshufb %ymm6, %ymm10, %ymm1 -; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm10 +; AVX512-NEXT: vmovdqa64 %ymm7, %ymm27 +; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm11 +; AVX512-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512-NEXT: vpor %ymm10, %ymm11, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 +; 
AVX512-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm6 +; AVX512-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm8, %ymm7 +; AVX512-NEXT: vmovdqa64 %ymm8, %ymm25 +; AVX512-NEXT: vpor %ymm6, %ymm7, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm18, %ymm13 +; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm6 +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm7 +; AVX512-NEXT: vmovdqa64 %ymm20, %ymm17 +; AVX512-NEXT: vpor %ymm6, %ymm7, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,1,0,1,2,3,0,1,18,18,19,19,18,0,19,19] +; AVX512-NEXT: vmovdqa 32(%rax), %xmm12 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpermi2d %zmm6, %zmm8, %zmm28 +; AVX512-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %xmm7 -; AVX512-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), 
%xmm6 -; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,6] -; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vmovdqa64 %xmm4, %xmm28 -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm20 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX512-NEXT: vporq %xmm1, %xmm0, %xmm23 -; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm26 -; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm25 -; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm10 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512-NEXT: vpor %xmm9, %xmm10, %xmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm9 +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm10 +; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm4 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] +; AVX512-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm5 +; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm0 +; 
AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] +; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512-NEXT: vmovdqa64 %ymm11, %ymm29 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512-NEXT: vpor %ymm3, %ymm2, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm3 +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm4 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u] +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%r9), %xmm5 +; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm6 +; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25] 
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm1 +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero +; AVX512-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] +; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm1 +; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm29 +; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31] +; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512-NEXT: vpshufb %ymm2, %ymm13, %ymm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512-NEXT: vmovdqa (%rax), %ymm7 -; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm17 -; AVX512-NEXT: vmovdqa (%rax), %xmm1 -; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,6] +; AVX512-NEXT: vmovdqa (%rax), %ymm4 +; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] +; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; 
AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm1[0,1,0,1] +; AVX512-NEXT: vmovdqa (%rax), %xmm7 +; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,5,6] ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm3 -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512-NEXT: vmovdqa %xmm9, %xmm12 -; AVX512-NEXT: vmovdqa64 %xmm6, %xmm19 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm31[2,3,2,3],zmm3[0,1,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512-NEXT: vmovdqa %xmm10, %xmm9 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; AVX512-NEXT: vpandnq %ymm2, %ymm17, %ymm2 +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm4[13],zero,zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero +; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 +; AVX512-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512-NEXT: vmovdqa64 %xmm19, %xmm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[2,3,2,3],zmm4[0,1,0,1] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-NEXT: vpshufb %ymm5, %ymm14, %ymm5 -; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14 -; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm14 -; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = 
zmm0[2,3,2,3],zmm2[0,1,0,1] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload -; AVX512-NEXT: # zmm29 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm31 ^ (zmm8 & (zmm16 ^ zmm31)) -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512-NEXT: # ymm3 = mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512-NEXT: # ymm31 = mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm3 ^ (zmm8 & (zmm31 ^ zmm3)) -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512-NEXT: # zmm3 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512-NEXT: vpshufb %xmm13, %xmm4, 
%xmm0 -; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm4[0,1,0,1],zmm10[0,1,0,1] -; AVX512-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,0,1],zmm12[0,1,0,1] -; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload -; AVX512-NEXT: # zmm13 = zmm13[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm23 & (zmm11 ^ zmm13)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (zmm23 & (zmm12 ^ zmm10)) +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm0[2,3,2,3],zmm2[0,1,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm4[2,3,2,3] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,0] +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm5[0,1,0,1] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512-NEXT: vmovdqa64 %xmm15, %xmm16 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm0[0,1,0,1],zmm5[0,1,0,1] +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[0,1,0,1],zmm1[0,1,0,1] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = zmm0[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512-NEXT: # zmm26 = zmm0[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX512-NEXT: vpandnq %zmm11, %zmm0, %zmm11 +; AVX512-NEXT: vpandq %zmm0, %zmm26, %zmm0 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm26 & (zmm0 | zmm11) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = zmm1[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpandnq %zmm11, %zmm26, %zmm11 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm1 & (zmm11 | zmm0) +; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512-NEXT: # zmm1 = (mem & ~zmm1) | zmm11 +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm9 +; AVX512-NEXT: vpandq %zmm0, %zmm22, %zmm4 +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm9 & ~zmm0) | zmm4 +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm4 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm4 & zmm17) +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512-NEXT: vpandq %zmm4, %zmm2, %zmm2 +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm28 & ~zmm4) | zmm2 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm3[0,1,0,1] +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero +; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm13 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: # zmm9 = zmm9[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,0,0,4,5,6,7] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512-NEXT: # zmm13 = zmm13[0,1,0,1],mem[0,1,0,1] +; 
AVX512-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512-NEXT: # zmm14 = zmm14[0,1,0,1],mem[0,1,0,1] +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm19 = zmm21 ^ (zmm16 & (zmm19 ^ zmm21)) ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[1,1,0,0,4,5,6,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm25[2,3,2,3],zmm26[2,3,2,3] -; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm21[2,3,2,3],zmm22[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm13 ^ (zmm23 & (zmm15 ^ zmm13)) +; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512-NEXT: # ymm17 = mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 +; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512-NEXT: # ymm18 = mem[2,3,2,3] +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm24, %zmm18 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm16 & (zmm18 ^ zmm17)) +; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,0,4,4,5,4] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm9)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm15 ^ (zmm26 & (zmm2 ^ zmm15)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload +; AVX512-NEXT: # zmm9 = zmm6[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm11 # 64-byte Folded Reload +; AVX512-NEXT: # zmm11 = zmm29[2,3,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm9 ^ (zmm26 & (zmm11 ^ zmm9)) ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload -; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) -; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm6, %zmm5 # 32-byte Folded Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,0,1,0] -; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm16)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & 
(zmm5 ^ zmm11)) -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm4 & mem) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm31)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm29)) -; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 32-byte Folded Reload +; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm5)) ; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm20 ^ (mem & (zmm9 ^ zmm20)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm12)) -; AVX512-NEXT: vporq %zmm27, %zmm24, %zmm3 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm3)) -; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm2 & mem) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm18)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm7 ^ (mem & (zmm14 ^ zmm7)) +; AVX512-NEXT: vporq %zmm31, %zmm27, %zmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm11)) ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX512-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) +; AVX512-NEXT: addq $1592, %rsp # imm = 0x638 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i8_stride7_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm1 -; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm9 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512-FCP-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23,u,u,u],zero,ymm11[26],zero,ymm11[24,u,u,u],zero,ymm11[27],zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 -; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] -; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm5 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero,ymm15[27],zero,ymm15[25] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm12 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31] -; AVX512-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm15 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm13 -; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[2,3,2,3],zmm13[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm8 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm19 -; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm0 +; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm6, %zmm0 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm7 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[2,3,2,3],zmm7[2,3,2,3] ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm22, 
%ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm7 -; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm20 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm3 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm7 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm10 +; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512-FCP-NEXT: vpor %ymm7, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm13 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm13 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm26 -; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm0 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,0,6,7,4,5,6,7,0,17,0,17,0,16,16,0] +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX512-FCP-NEXT: 
vpshuflw {{.*#+}} xmm13 = xmm9[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm0, %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm12 +; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm6 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512-FCP-NEXT: vpor %xmm6, %xmm9, %xmm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm23 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm23 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm0 +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,1,0,1,2,3,0,1,18,18,19,19,18,0,19,19] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm9 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512-FCP-NEXT: vpor %xmm0, %xmm9, %xmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm6 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm8 -; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm9 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm1[18,19,20,21],zero,ymm1[19],zero,ymm1[25,26,27,22],zero,ymm1[20],zero +; AVX512-FCP-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero,zero,ymm11[18] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero -; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm4 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512-FCP-NEXT: 
vmovdqa64 %ymm6, %ymm24 -; AVX512-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm3[18],zero,ymm3[20,21,20,21],zero,ymm3[19],zero,ymm3[19,20,21,22],zero -; AVX512-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] -; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm2 +; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[13,u,u,u,u,u],zero,ymm5[14,u,u,u,u,u],zero,ymm5[15,u,u,u,u,u],zero,ymm5[16,u,u,u,u,u],zero,ymm5[17,u,u,u] +; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 +; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7] +; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm19 +; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm29 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; 
AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm10 -; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[1,1,0,0,4,5,6,7] -; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm18 -; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512-FCP-NEXT: vporq %xmm3, %xmm4, %xmm26 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25] -; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero -; AVX512-FCP-NEXT: vporq %ymm3, %ymm4, %ymm25 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm24 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm23 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero -; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm22 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm12 +; AVX512-FCP-NEXT: vporq %xmm9, %xmm12, %xmm30 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} 
ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[23,u,u,u],zero,ymm14[26],zero,ymm14[24,u,u,u],zero,ymm14[27],zero +; AVX512-FCP-NEXT: vporq %ymm9, %ymm12, %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] -; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm9 +; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm8 +; AVX512-FCP-NEXT: vporq %ymm9, %ymm8, %ymm22 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm7[23],zero,ymm7[21,22,23,26],zero,ymm7[24],zero,ymm7[28,29,26,27] +; AVX512-FCP-NEXT: vporq %ymm8, %ymm9, %ymm21 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero +; AVX512-FCP-NEXT: vporq %ymm8, %ymm7, %ymm20 +; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm8[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] +; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm8[2,3,2,3],zmm7[2,3,2,3] +; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,2,3,3,2,2,3,3] +; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; AVX512-FCP-NEXT: vpandnq %ymm0, %ymm28, %ymm0 +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm14 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm10 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; 
AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm10 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm3[0,1,0,1],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm12 +; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512-FCP-NEXT: vmovdqa 
{{.*#+}} xmm2 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] -; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512-FCP-NEXT: vpermd %ymm13, %ymm28, %ymm13 -; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm6 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm26[0,1,0,1] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm24[2,3,2,3],zmm25[2,3,2,3] -; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm22[2,3,2,3],zmm23[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm12 & (zmm7 ^ zmm5)) -; AVX512-FCP-NEXT: vporq %zmm20, %zmm19, %zmm5 -; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm5)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm0[0,1,0,1],zmm1[0,1,0,1] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm1 +; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm9 +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm2 = zmm2[0,1,0,1],mem[0,1,0,1] +; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm30[0,1,0,1] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm22[2,3,2,3],zmm24[2,3,2,3] +; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm20[2,3,2,3],zmm21[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm11 & (zmm5 ^ zmm4)) +; AVX512-FCP-NEXT: vporq %zmm18, %zmm17, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; 
AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm7)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm17 ^ (zmm12 & (zmm31 ^ zmm17)) -; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm21 ^ (zmm7 & (zmm16 ^ zmm21)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX512-FCP-NEXT: vpandnq %zmm0, %zmm5, %zmm0 +; AVX512-FCP-NEXT: vpandq %zmm5, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm11 & (zmm4 | zmm0) +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,0,1] +; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,0,1] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm8 = zmm8[2,3,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpandnq %zmm8, %zmm11, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 & (zmm8 | zmm4) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm12 +; AVX512-FCP-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm12 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm12 = (mem & ~zmm12) | zmm8 +; AVX512-FCP-NEXT: vpandq %zmm4, %zmm31, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512-FCP-NEXT: vpternlogq 
$206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm4 = (mem & ~zmm4) | zmm8 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm26 ^ (zmm11 & (zmm14 ^ zmm26)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm28) +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512-FCP-NEXT: vpandq %zmm0, %zmm27, %zmm8 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm0 +; AVX512-FCP-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm0 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm0 = (mem & ~zmm0) | zmm8 +; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm23 ^ (zmm8 & (zmm10 ^ zmm23)) ; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512-FCP-NEXT: # ymm11 = mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512-FCP-NEXT: # ymm12 = mem[2,3,2,3] -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (zmm7 & (zmm12 ^ zmm11)) -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4)) -; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm3 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm4 ^ (mem & (zmm30 ^ zmm4)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm16)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (mem & (zmm9 ^ zmm1)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31)) -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3] -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm12)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8)) -; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm18 ^ (mem & (zmm2 ^ zmm18)) -; 
AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6)) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512-FCP-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 +; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512-FCP-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm11 ^ (zmm8 & (zmm15 ^ zmm11)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm19 ^ (zmm13 & (zmm1 ^ zmm19)) +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 ^ (mem & (zmm6 ^ zmm7)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm10)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm14)) +; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[2,3,2,3] +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm15)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) +; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i8_stride7_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $1256, %rsp # imm = 0x4E8 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm10, %ymm1 +; AVX512DQ-NEXT: subq $1592, %rsp # imm = 0x638 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] +; AVX512DQ-NEXT: 
vmovdqa64 %ymm2, %ymm20 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm7 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero,zero,zero,ymm7[18] +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[12,13,0,1,14,15],zero,ymm6[3,12,13,2,3,16],zero,ymm6[30,31,28,29,16,17],zero,ymm6[31,18,19,28,29,18],zero ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm11 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm26 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm2 -; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm29 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm14 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm0 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm15 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0] -; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm15, %ymm7 -; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm21 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm18 -; AVX512DQ-NEXT: vporq %ymm0, %ymm7, %ymm31 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm13, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm19 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm11, %ymm7 -; AVX512DQ-NEXT: 
vmovdqa64 %ymm11, %ymm16 -; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] -; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm7 -; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm17 -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] -; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm15, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm24 -; AVX512DQ-NEXT: vpor %ymm7, %ymm13, %ymm7 -; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm7 -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm7, %ymm13 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm8 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm12 -; AVX512DQ-NEXT: vpor %ymm13, %ymm12, %ymm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128] -; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm13 -; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm25 -; AVX512DQ-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm12 -; AVX512DQ-NEXT: vpshufb %ymm9, %ymm12, %ymm9 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-NEXT: vpshufb %ymm10, %ymm13, %ymm10 -; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm12[23],zero,ymm12[21,22,23,26],zero,ymm12[24],zero,ymm12[28,29,26,27] -; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm9, %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm10 -; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm11 -; AVX512DQ-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm9[23],zero,ymm9[23,24,25,26],zero,ymm9[24],zero,ymm9[30,31] -; AVX512DQ-NEXT: vpor %ymm0, %ymm11, %ymm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm14 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = 
ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm2 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm5 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] +; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm3, %ymm5, %ymm1 +; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm3, %ymm12, %ymm1 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm2[23],zero,ymm2[21,22,23,26],zero,ymm2[24],zero,ymm2[28,29,26,27] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm12[18,19,20,21],zero,ymm12[19],zero,ymm12[25,26,27,22],zero,ymm12[20],zero +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero,ymm2[27],zero,ymm2[25] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512DQ-NEXT: # 
ymm1 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm0 -; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm22 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm12 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm14 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] +; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm14, %ymm13 +; AVX512DQ-NEXT: vpor %ymm12, %ymm13, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31] +; AVX512DQ-NEXT: vpor %ymm12, %ymm13, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22] -; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] -; AVX512DQ-NEXT: # ymm8 = mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm9, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm13[2,3,2,3],zmm12[2,3,2,3] ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpshufb %ymm2, %ymm9, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm6, %ymm10, %ymm1 -; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm10 +; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm27 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm24 +; AVX512DQ-NEXT: vpor %ymm10, %ymm11, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm6 +; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21 +; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm8, %ymm7 +; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm25 +; AVX512DQ-NEXT: vpor %ymm6, %ymm7, %ymm0 +; AVX512DQ-NEXT: 
vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13 +; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm6 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm7 +; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm17 +; AVX512DQ-NEXT: vpor %ymm6, %ymm7, %ymm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm1 +; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm3 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm30 -; AVX512DQ-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm31 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18 +; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm6 +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm20 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm30 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm7 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm19 +; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,1,0,1,2,3,0,1,18,18,19,19,18,0,19,19] +; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm12 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm8, %zmm28 +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQ-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm6 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm7 -; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm10 -; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm4 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm13 -; AVX512DQ-NEXT: vpshufb %xmm5, %xmm13, %xmm5 -; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm2 -; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1 
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,6] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm28 -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm20 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm15 -; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm11 -; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm0 -; AVX512DQ-NEXT: vporq %xmm1, %xmm0, %xmm23 -; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero,ymm3[25] -; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1 -; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm26 -; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 -; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm1 -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm25 -; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero -; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm3 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm22 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero -; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm1 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm9 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm10 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512DQ-NEXT: vpor %xmm9, %xmm10, %xmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm9 +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm10 +; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20] +; AVX512DQ-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm4 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128] +; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm5 +; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero +; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3 +; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm29 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpor %ymm3, %ymm2, %ymm0 +; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm3 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm4 +; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16 +; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[13,u,u,u,u,u],zero,ymm14[14,u,u,u,u,u],zero,ymm14[15,u,u,u,u,u],zero,ymm14[16,u,u,u,u,u],zero,ymm14[17,u,u,u] +; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm1 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm0 +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm1 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm1 +; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27],zero,ymm4[25] +; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm1 +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] +; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm1 +; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm29 +; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[2,3,2,3],zmm0[2,3,2,3] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31] +; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2 +; AVX512DQ-NEXT: vpshufb %ymm2, %ymm13, %ymm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm1[23],zero,ymm1[23,24,25,26],zero,ymm1[24],zero,ymm1[30,31] -; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm1[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm7 -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm17 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm1 -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,5,6] +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm4 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm8 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm23 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm1[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm7 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5,5,6] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm3 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm19 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm31[2,3,2,3],zmm3[0,1,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3],xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] -; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm9 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; AVX512DQ-NEXT: vpandnq %ymm2, %ymm17, %ymm2 +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm4[13],zero,zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm20 +; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm13 +; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm6 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[2,3,2,3],zmm4[0,1,0,1] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-NEXT: vpshufb %ymm5, %ymm14, %ymm5 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = 
ymm14[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,1,1,3,4,5,5,7] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2] -; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm14 -; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0 +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm0[2,3,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm29 = zmm0[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm6 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm31 ^ (zmm8 & (zmm16 ^ zmm31)) -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm3 = mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm31, %zmm3 -; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload -; AVX512DQ-NEXT: # ymm31 = mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm3 ^ (zmm8 & (zmm31 ^ zmm3)) -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm3 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] -; AVX512DQ-NEXT: vpshufb %xmm10, %xmm8, %xmm8 -; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-NEXT: vpshuflw 
{{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm0 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm4[0,1,0,1],zmm10[0,1,0,1] -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm4 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm5, %xmm5 -; AVX512DQ-NEXT: vpshufb %xmm13, %xmm12, %xmm12 -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm4[0,1,0,1],zmm12[0,1,0,1] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm11[8],xmm15[8],xmm11[9],xmm15[9],xmm11[10],xmm15[10],xmm11[11],xmm15[11],xmm11[12],xmm15[12],xmm11[13],xmm15[13],xmm11[14],xmm15[14],xmm11[15],xmm15[15] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm6, %xmm6 -; AVX512DQ-NEXT: vpshufb %xmm11, %xmm9, %xmm9 -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm13 = zmm13[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm11)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm11 = zmm11[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm23 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm23 & (zmm11 ^ zmm13)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (zmm23 & (zmm12 ^ zmm10)) +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm0[2,3,2,3],zmm2[0,1,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm4[2,3,2,3] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[1,1,0,0,4,5,6,7] +; 
AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,2,0] +; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm0[2,3,2,3],zmm5[0,1,0,1] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm16 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm12, %xmm12 +; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm5 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm0[0,1,0,1],zmm5[0,1,0,1] +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm6[8],xmm13[9],xmm6[9],xmm13[10],xmm6[10],xmm13[11],xmm6[11],xmm13[12],xmm6[12],xmm13[13],xmm6[13],xmm13[14],xmm6[14],xmm13[15],xmm6[15] +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm6 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = zmm0[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm26 = zmm0[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX512DQ-NEXT: vpandnq %zmm11, %zmm0, %zmm11 +; AVX512DQ-NEXT: vpandq %zmm0, %zmm26, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm26 & (zmm0 | zmm11) +; 
AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = zmm1[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpandnq %zmm11, %zmm26, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm1 & (zmm11 | zmm0) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm1 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm1 = (mem & ~zmm1) | zmm11 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm9 +; AVX512DQ-NEXT: vpandq %zmm0, %zmm22, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm9 & ~zmm0) | zmm4 +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm4 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm4 & zmm17) +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-NEXT: vpandq %zmm4, %zmm2, %zmm2 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm4, %zmm4 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm28 & ~zmm4) | zmm2 +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm3[0,1,0,1] +; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,ymm9[13],zero,zero,zero,zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero +; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm9 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm13 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm13 +; AVX512DQ-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm9 = 
zmm9[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,0,0,4,5,6,7] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm7, %zmm7 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm13 = zmm13[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm6 +; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7] +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm14 = zmm14[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm21 ^ (zmm16 & (zmm19 ^ zmm21)) ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4 -; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[1,1,0,0,4,5,6,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm25[2,3,2,3],zmm26[2,3,2,3] -; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm21[2,3,2,3],zmm22[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm13 ^ (zmm23 & (zmm15 ^ zmm13)) +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] +; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm17 = mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 +; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload +; AVX512DQ-NEXT: # ymm18 = mem[2,3,2,3] +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm24, %zmm18 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm16 & (zmm18 ^ zmm17)) +; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm7[0,0,1,0,4,4,5,4] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm11 & (zmm7 ^ zmm9)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm15 ^ (zmm26 & (zmm2 ^ zmm15)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm9 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm9 = zmm6[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm11 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm11 = zmm29[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm9 ^ (zmm26 & (zmm11 ^ zmm9)) ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte 
Folded Reload -; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) -; AVX512DQ-NEXT: vinserti64x4 $1, (%rsp), %zmm6, %zmm5 # 32-byte Folded Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm5 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0)) -; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm10[0,0,1,0] -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm16)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm11)) -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload -; AVX512DQ-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm4 & mem) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm31)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm29)) -; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm3)) +; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm9 # 32-byte Folded Reload +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm5)) ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm20 ^ (mem & (zmm9 ^ zmm20)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm12)) -; AVX512DQ-NEXT: vporq %zmm27, %zmm24, %zmm3 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm3)) -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (mem & (zmm17 ^ zmm15)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2)) +; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload +; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-NEXT: # zmm2 = zmm2[0,1,2,3],mem[2,3,2,3] +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm2 & mem) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm18)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm7 ^ (mem & (zmm14 ^ zmm7)) +; AVX512DQ-NEXT: vporq %zmm31, %zmm27, %zmm2 +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm11)) ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 256(%rax) ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax) -; AVX512DQ-NEXT: addq $1256, %rsp # imm = 0x4E8 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) +; 
AVX512DQ-NEXT: addq $1592, %rsp # imm = 0x638 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $1448, %rsp # imm = 0x5A8 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18] -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,14],zero,ymm2[12,13,0,1,14,15],zero,ymm2[3,12,13,2,3,16],zero,ymm2[30,31,28,29,16,17],zero,ymm2[31,18,19,28,29,18],zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24 -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm9 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[13,u,u,u,u,u],zero,ymm9[14,u,u,u,u,u],zero,ymm9[15,u,u,u,u,u],zero,ymm9[16,u,u,u,u,u],zero,ymm9[17,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm11 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-FCP-NEXT: subq $1496, %rsp # imm = 0x5D8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm3 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm17 ; 
AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm11[23,u,u,u],zero,ymm11[26],zero,ymm11[24,u,u,u],zero,ymm11[27],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm3[23],zero,ymm3[21,22,23,26],zero,ymm3[24],zero,ymm3[28,29,26,27] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm19 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0] -; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3 -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] -; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128] +; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] +; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm5 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm15[23],zero,zero,zero,zero,ymm15[26],zero,ymm15[24],zero,zero,zero,zero,ymm15[27],zero,ymm15[25] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29] ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128] ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22 -; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm3[23],zero,ymm3[23,24,25,26],zero,ymm3[24],zero,ymm3[30,31] -; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26 +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm15 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] -; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm15[2,3,2,3],zmm13[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm7 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[2,3,2,3],zmm7[2,3,2,3] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm19 -; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm17 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm3 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[13,u,u,u,u,u],zero,ymm3[14,u,u,u,u,u],zero,ymm3[15,u,u,u,u,u],zero,ymm3[16,u,u,u,u,u],zero,ymm3[17,u,u,u] +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm14 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28 +; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm18 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm13 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11 +; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm13 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm6, %zmm0 -; 
AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm20 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm11 = [4,0,6,7,4,5,6,7,0,17,0,17,0,16,16,0] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm9 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm0, %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm17 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm16 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm9, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, 
%xmm23 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm18 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm8, %zmm0 +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,0,1,2,3,0,1,18,18,19,19,18,0,19,19] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm9, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm16 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm9, %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm31 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm9 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm9, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm8 -; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm1[18,19,20,21],zero,ymm1[19],zero,ymm1[25,26,27,22],zero,ymm1[20],zero +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm4 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero -; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20 +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] +; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero,zero,ymm11[18] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[0,1,14],zero,ymm5[12,13,0,1,14,15],zero,ymm5[3,12,13,2,3,16],zero,ymm5[30,31,28,29,16,17],zero,ymm5[31,18,19,28,29,18],zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128] +; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm2 +; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[13,u,u,u,u,u],zero,ymm5[14,u,u,u,u,u],zero,ymm5[15,u,u,u,u,u],zero,ymm5[16,u,u,u,u,u],zero,ymm5[17,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128] -; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1 -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23] -; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24 -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm3[18],zero,ymm3[20,21,20,21],zero,ymm3[19],zero,ymm3[19,20,21,22],zero -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13,u,u,u,u,u],zero,ymm2[14,u,u,u,u,u],zero,ymm2[15,u,u,u,u,u],zero,ymm2[16,u,u,u,u,u],zero,ymm2[17,u,u,u] -; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12 -; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8 -; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 +; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm1 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm19 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm29 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm3 +; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm2 -; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm10 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[1,1,0,0,4,5,6,7] -; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm4 -; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm4, %xmm26 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero,ymm13[25] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero -; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm4, %ymm25 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm4[23],zero,ymm4[21,22,23,26],zero,ymm4[24],zero,ymm4[28,29,26,27] -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm23 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm4[18,19,20,21],zero,ymm4[19],zero,ymm4[25,26,27,22],zero,ymm4[20],zero -; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm22 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm12 +; AVX512DQ-FCP-NEXT: vporq %xmm9, %xmm12, %xmm30 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero,ymm8[27],zero,ymm8[25] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm14[23,u,u,u],zero,ymm14[26],zero,ymm14[24,u,u,u],zero,ymm14[27],zero +; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm12, %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[20],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm3[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,6] -; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3] -; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm29 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1 +; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm8 +; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm8, %ymm22 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm7[23],zero,ymm7[21,22,23,26],zero,ymm7[24],zero,ymm7[28,29,26,27] +; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm9, %ymm21 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm7[18,19,20,21],zero,ymm7[19],zero,ymm7[25,26,27,22],zero,ymm7[20],zero +; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm7, %ymm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm8[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2 +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm2[23],zero,ymm2[23,24,25,26],zero,ymm2[24],zero,ymm2[30,31] +; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm8[2,3,2,3],zmm7[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6] +; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,2,3,3,2,2,3,3] +; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] +; AVX512DQ-FCP-NEXT: vpandnq %ymm0, %ymm28, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128] +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm7 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm14 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm10 -; 
AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm27 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm0[2,3,2,3],zmm3[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm10 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm3[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm15[8],xmm2[8],xmm15[9],xmm2[9],xmm15[10],xmm2[10],xmm15[11],xmm2[11],xmm15[12],xmm2[12],xmm15[13],xmm2[13],xmm15[14],xmm2[14],xmm15[15],xmm2[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3 +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm8[8],xmm2[8],xmm8[9],xmm2[9],xmm8[10],xmm2[10],xmm8[11],xmm2[11],xmm8[12],xmm2[12],xmm8[13],xmm2[13],xmm8[14],xmm2[14],xmm8[15],xmm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16 +; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm26 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm12 +; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm1[8],xmm9[9],xmm1[9],xmm9[10],xmm1[10],xmm9[11],xmm1[11],xmm9[12],xmm1[12],xmm9[13],xmm1[13],xmm9[14],xmm1[14],xmm9[15],xmm1[15] -; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3 -; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm2[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0 -; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; 
AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] -; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm28, %ymm13 -; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm13 -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm8 = zmm8[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm6 = zmm6[0,1,0,1],mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1 -; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm26[0,1,0,1] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm24[2,3,2,3],zmm25[2,3,2,3] -; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm22[2,3,2,3],zmm23[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm12 & (zmm7 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vporq %zmm20, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm0[0,1,0,1],zmm1[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm2 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm1 +; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm9 +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm1 = zmm1[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = 
xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-FCP-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm2 = zmm2[0,1,0,1],mem[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm30[0,1,0,1] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm22[2,3,2,3],zmm24[2,3,2,3] +; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm20[2,3,2,3],zmm21[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm11 & (zmm5 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vporq %zmm18, %zmm17, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm4)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5)) ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm12 & (zmm5 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm17 ^ (zmm12 & (zmm31 ^ zmm17)) -; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm21 ^ (zmm7 & (zmm16 ^ zmm21)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = 
zmm4[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] +; AVX512DQ-FCP-NEXT: vpandnq %zmm0, %zmm5, %zmm0 +; AVX512DQ-FCP-NEXT: vpandq %zmm5, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm11 & (zmm4 | zmm0) +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm8[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,0,1] +; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,0,1] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm8 = zmm8[2,3,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpandnq %zmm8, %zmm11, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm12 & (zmm8 | zmm4) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm12 +; AVX512DQ-FCP-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm12 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm12 = (mem & ~zmm12) | zmm8 +; AVX512DQ-FCP-NEXT: vpandq %zmm4, %zmm31, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255] +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm4, %zmm4 +; AVX512DQ-FCP-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm4 = (mem & ~zmm4) | zmm8 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm26 ^ (zmm11 & (zmm14 ^ zmm26)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm28) +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] +; AVX512DQ-FCP-NEXT: vpandq %zmm0, %zmm27, %zmm8 +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm0 +; AVX512DQ-FCP-NEXT: vpternlogq $206, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm0 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm0 = (mem & ~zmm0) | zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm23 ^ (zmm8 & (zmm10 ^ zmm23)) ; AVX512DQ-FCP-NEXT: vpermq $238, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: # ymm11 = mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11 -; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: # ymm12 = mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (zmm7 & (zmm12 ^ zmm11)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm3, %zmm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm3 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm4 ^ (mem & (zmm30 ^ zmm4)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm16)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (mem & (zmm9 ^ zmm1)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31)) -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload -; AVX512DQ-FCP-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3] -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm12)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm18 ^ (mem & (zmm2 ^ zmm18)) -; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6)) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $1448, %rsp # imm = 0x5A8 +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11 +; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: # ymm15 = mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15 +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm11 ^ (zmm8 & (zmm15 ^ zmm11)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm19 ^ (zmm13 & (zmm1 ^ zmm19)) +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload +; AVX512DQ-FCP-NEXT: 
vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm7 ^ (mem & (zmm6 ^ zmm7)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm10)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm14)) +; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload +; AVX512DQ-FCP-NEXT: # zmm5 = zmm5[0,1,2,3],mem[2,3,2,3] +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm15)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2)) +; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (mem & (zmm3 ^ zmm1)) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 384(%rax) +; AVX512DQ-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; ; AVX512BW-LABEL: store_i8_stride7_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %ymm16 +; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512BW-NEXT: vpermi2w %zmm8, %zmm8, %zmm0 +; AVX512BW-NEXT: vmovdqa (%r9), %ymm12 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-NEXT: vpshufb %ymm16, %ymm12, %ymm1 +; AVX512BW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vmovdqa64 (%r9), %xmm25 +; AVX512BW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512BW-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 +; AVX512BW-NEXT: kmovq %r10, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-NEXT: vpshufb %ymm2, %ymm16, %ymm1 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm18 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm15 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm3, %ymm18, %ymm4 -; AVX512BW-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512BW-NEXT: vpshufb %ymm0, %ymm16, %ymm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm0, %ymm18, %ymm5 -; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %ymm24 -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm23 -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm23[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vpshufb %ymm3, %ymm15, %ymm4 +; AVX512BW-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm22 +; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm21 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm21[8],xmm22[8],xmm21[9],xmm22[9],xmm21[10],xmm22[10],xmm21[11],xmm22[11],xmm21[12],xmm22[12],xmm21[13],xmm22[13],xmm21[14],xmm22[14],xmm21[15],xmm22[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-NEXT: vpshufb %ymm4, %ymm17, %ymm5 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-NEXT: vpshufb %ymm9, %ymm18, %ymm7 +; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm19 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm13[8],xmm19[8],xmm13[9],xmm19[9],xmm13[10],xmm19[10],xmm13[11],xmm19[11],xmm13[12],xmm19[12],xmm13[13],xmm19[13],xmm13[14],xmm19[14],xmm13[15],xmm19[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm10 +; AVX512BW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 +; AVX512BW-NEXT: kmovq %r10, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512BW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 +; AVX512BW-NEXT: kmovq %r10, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k1} +; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512BW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512BW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm20 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,0,1,1,4,4,5,5] ; AVX512BW-NEXT: movl $676341840, %r10d # imm = 0x28502850 ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm24[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-NEXT: vpshufb %ymm8, %ymm23, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-NEXT: vpshufb %ymm10, %ymm24, %ymm6 -; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm6 -; AVX512BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 -; AVX512BW-NEXT: kmovq %r10, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm5, %ymm4, %ymm7 -; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-NEXT: vpshufb %ymm1, %ymm9, %ymm11 -; AVX512BW-NEXT: vpor %ymm7, %ymm11, %ymm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512BW-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512BW-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm20 {%k1} = ymm3[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; AVX512BW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512BW-NEXT: vpshufb %ymm9, %ymm3, %ymm9 ; AVX512BW-NEXT: vpor %ymm4, %ymm9, %ymm4 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512BW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512BW-NEXT: vpermw %ymm11, %ymm0, %ymm12 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm9 +; AVX512BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 +; AVX512BW-NEXT: kmovq %r10, %k3 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k3} +; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512BW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %ymm16 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm16, %ymm20 +; AVX512BW-NEXT: vporq %ymm4, %ymm20, %ymm4 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm16, %ymm16 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512BW-NEXT: vporq %ymm16, %ymm2, %ymm2 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] +; AVX512BW-NEXT: vpermw %ymm4, %ymm6, %ymm16 +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm4 {%k3} +; AVX512BW-NEXT: kmovq %rax, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm2 {%k2} ; AVX512BW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm6 {%k3} -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512BW-NEXT: vpshufb %ymm8, %ymm13, %ymm4 -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512BW-NEXT: vpshufb %ymm10, %ymm15, %ymm8 -; AVX512BW-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm25 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm26 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm4 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm2, %ymm17, %ymm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm3, %ymm19, %ymm3 -; AVX512BW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, 
%zmm3, %zmm2 -; AVX512BW-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k3} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm5, %ymm20, %ymm2 -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %ymm1, %ymm22, %ymm3 -; AVX512BW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-NEXT: vpermw %zmm14, %zmm3, %zmm3 -; AVX512BW-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} -; AVX512BW-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k3} -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: kmovq %rax, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k2} +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm24 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512BW-NEXT: movl $338170920, %eax # imm = 0x14281428 -; AVX512BW-NEXT: kmovd %eax, %k3 -; AVX512BW-NEXT: vpshufb %ymm3, %ymm15, %ymm2 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-NEXT: vpshufb %xmm31, %xmm5, %xmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm5[0,1,0,1] -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512BW-NEXT: vpshufb %ymm1, %ymm19, %ymm5 -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm29 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-NEXT: vpshufb %xmm7, %xmm21, %xmm21 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[2,3,2,3],zmm21[0,1,0,1] -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm21 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm20[27],zero,zero,zero,zero,ymm20[30],zero,ymm20[28],zero,zero,zero,zero,ymm20[31],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[27],zero,zero,zero,zero,ymm22[30],zero,ymm22[28],zero,zero,zero,zero,ymm22[31],zero,ymm22[29] -; AVX512BW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm28 -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-NEXT: vpermw %zmm14, %zmm2, %zmm2 -; AVX512BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512BW-NEXT: kmovq %rax, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2} -; AVX512BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512BW-NEXT: kmovq %rax, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm21 {%k2} -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm23[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX512BW-NEXT: vpshufb %ymm3, %ymm24, %ymm0 {%k3} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm3, %ymm24, %ymm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512BW-NEXT: vpshufb %ymm24, %ymm23, %ymm23 -; AVX512BW-NEXT: vporq %ymm2, %ymm23, %ymm2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm16[24,25],zero,ymm16[23],zero,ymm16[21,22,23,26],zero,ymm16[24],zero,ymm16[28,29,26,27] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[25],zero,ymm18[23],zero,zero,zero,zero,ymm18[26],zero,ymm18[24],zero,zero,zero,zero -; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-NEXT: vpshufb %ymm1, %ymm18, %ymm1 -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,3,3,4,6,7,7] -; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-NEXT: kmovd %eax, %k4 +; AVX512BW-NEXT: vpshufb %ymm24, %ymm3, %ymm2 {%k4} +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm31, %ymm3, %ymm3 +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512BW-NEXT: vpshufb %ymm20, %ymm5, %ymm4 +; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3 
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[2,3,2,3],zmm2[2,3,2,3] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512BW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7] +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-NEXT: kmovq %rax, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k2} -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,6,7],zmm1[4,5,6,7] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm2[23],zero,zmm2[23,24,25,26],zero,zmm2[24],zero,zmm2[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61] -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k2} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[4,5,6,7],zmm6[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[4,5,6,7],zmm7[4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] ; AVX512BW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512BW-NEXT: kmovq %rax, %k2 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k2} +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k2} ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] -; AVX512BW-NEXT: vpermw %zmm14, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermw %zmm8, %zmm0, %zmm0 ; AVX512BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 +; AVX512BW-NEXT: kmovq %rax, %k5 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 
{%k5} +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm17[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512BW-NEXT: vpshufb %ymm24, %ymm18, %ymm0 {%k4} +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm29 +; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm29[0],xmm30[0],xmm29[1],xmm30[1],xmm29[2],xmm30[2],xmm29[3],xmm30[3],xmm29[4],xmm30[4],xmm29[5],xmm30[5],xmm29[6],xmm30[6],xmm29[7],xmm30[7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-NEXT: vpshufb %xmm26, %xmm1, %xmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,2,3],zmm1[0,1,0,1] +; AVX512BW-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7] +; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm3 {%k1} +; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm28 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-NEXT: vpshufb %xmm28, %xmm24, %xmm24 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm3[2,3,2,3],zmm24[0,1,0,1] +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm24 {%k3} +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] +; AVX512BW-NEXT: vpor %ymm2, %ymm3, %ymm1 +; AVX512BW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm27, %xmm0, %xmm0 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-NEXT: vpermw %zmm8, %zmm1, %zmm1 +; AVX512BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512BW-NEXT: vpshufb %xmm18, %xmm26, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm25, %xmm1 -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm25[0],xmm26[0],xmm25[1],xmm26[1],xmm25[2],xmm26[2],xmm25[3],xmm26[3],xmm25[4],xmm26[4],xmm25[5],xmm26[5],xmm25[6],xmm26[6],xmm25[7],xmm26[7] -; AVX512BW-NEXT: vpshufb %xmm31, %xmm1, %xmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512BW-NEXT: vpshufb %xmm0, %xmm19, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512BW-NEXT: 
vpshufb %xmm26, %xmm17, %xmm25 -; AVX512BW-NEXT: vporq %xmm1, %xmm25, %xmm1 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] -; AVX512BW-NEXT: vpshufb %xmm7, %xmm25, %xmm7 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,0,1],zmm1[0,1,0,1] -; AVX512BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512BW-NEXT: vpshufb %xmm1, %xmm20, %xmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512BW-NEXT: vpshufb %xmm25, %xmm22, %xmm31 -; AVX512BW-NEXT: vporq %xmm7, %xmm31, %xmm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm22[0],xmm20[0],xmm22[1],xmm20[1],xmm22[2],xmm20[2],xmm22[3],xmm20[3],xmm22[4],xmm20[4],xmm22[5],xmm20[5],xmm22[6],xmm20[6],xmm22[7],xmm20[7] -; AVX512BW-NEXT: vpshufb %xmm5, %xmm31, %xmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm7[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-NEXT: vpermw %zmm14, %zmm7, %zmm7 -; AVX512BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k3} -; AVX512BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-NEXT: kmovq %rax, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm16 {%k3} -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,1,1,4,4,5,5] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm15[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512BW-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX512BW-NEXT: vpshufb %ymm24, %ymm13, %ymm7 -; AVX512BW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[2,3,2,3],zmm3[2,3,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm17[18],zero,zmm17[18,19,20,21],zero,zmm17[19],zero,zmm17[25,26,27,22],zero,zmm17[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm17[56,57],zero,zmm17[55],zero,zmm17[53,54,55,58],zero,zmm17[56],zero,zmm17[60,61,58,59] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm19[18],zero,zero,zero,zero,zmm19[21],zero,zmm19[19],zero,zero,zero,zero,zmm19[22],zero,zmm19[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm19[57],zero,zmm19[55],zero,zero,zero,zero,zmm19[58],zero,zmm19[56],zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm5, %zmm7, %zmm5 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm5 {%k2} -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm22[20],zero,zmm22[18],zero,zmm22[20,21,20,21],zero,zmm22[19],zero,zmm22[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm22[56,57,56,57],zero,zmm22[55],zero,zmm22[55,56,57,58],zero,zmm22[56],zero,zmm22[62,63] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm7 = 
zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[20],zero,zmm20[18],zero,zero,zero,zero,zmm20[21],zero,zmm20[19],zero,zero,zero,zero,zmm20[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm20[57],zero,zmm20[55],zero,zero,zero,zero,zmm20[58],zero,zmm20[56],zero,zero -; AVX512BW-NEXT: vporq %zmm3, %zmm7, %zmm3 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512BW-NEXT: vpermw %zmm14, %zmm7, %zmm7 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm24 {%k3} +; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm17[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm18[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512BW-NEXT: vpshufb %ymm31, %ymm18, %ymm1 +; AVX512BW-NEXT: vpshufb %ymm20, %ymm17, %ymm17 +; AVX512BW-NEXT: vporq %ymm1, %ymm17, %ymm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[25,26,27,22],zero,zmm1[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[56,57],zero,zmm1[55],zero,zmm1[53,54,55,58],zero,zmm1[56],zero,zmm1[60,61,58,59] +; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, %zmm14 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[18],zero,zero,zero,zero,zmm14[21],zero,zmm14[19],zero,zero,zero,zero,zmm14[22],zero,zmm14[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm14[57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero +; AVX512BW-NEXT: vporq %zmm1, %zmm14, %zmm1 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm14 = zmm1[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[20],zero,zmm0[18],zero,zmm0[20,21,20,21],zero,zmm0[19],zero,zmm0[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,56,57],zero,zmm0[55],zero,zmm0[55,56,57,58],zero,zmm0[56],zero,zmm0[62,63] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero +; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] +; AVX512BW-NEXT: vpermw %zmm8, %zmm1, %zmm1 ; AVX512BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512BW-NEXT: kmovq %rax, %k1 -; 
AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm5 {%k1} -; AVX512BW-NEXT: vpshufb %xmm0, %xmm27, %xmm0 -; AVX512BW-NEXT: vpshufb %xmm26, %xmm29, %xmm3 -; AVX512BW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm27[8],xmm29[8],xmm27[9],xmm29[9],xmm27[10],xmm29[10],xmm27[11],xmm29[11],xmm27[12],xmm29[12],xmm27[13],xmm29[13],xmm27[14],xmm29[14],xmm27[15],xmm29[15] -; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512BW-NEXT: vpshufb %xmm18, %xmm10, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm3[0,1,0,1] -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512BW-NEXT: vpshufb %xmm1, %xmm28, %xmm0 -; AVX512BW-NEXT: vpshufb %xmm25, %xmm30, %xmm1 -; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[0,1,0,1] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] -; AVX512BW-NEXT: vpermw %zmm14, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512BW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX512BW-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,0,1],zmm4[0,1,0,1] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm30, %xmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm29, %xmm11 +; AVX512BW-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm11[0,1,0,1] +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm5 {%k2} +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm11 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm15 +; AVX512BW-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; 
AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm11[0,1,0,1],zmm2[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512BW-NEXT: vpermw %zmm8, %zmm3, %zmm3 ; AVX512BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} ; AVX512BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm5 {%k1} +; AVX512BW-NEXT: vpshufb %xmm0, %xmm21, %xmm0 +; AVX512BW-NEXT: vpshufb %xmm6, %xmm22, %xmm2 +; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-NEXT: vpshufb %xmm28, %xmm2, %xmm2 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm19, %xmm3 +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3],xmm19[4],xmm13[4],xmm19[5],xmm13[5],xmm19[6],xmm13[6],xmm19[7],xmm13[7] +; AVX512BW-NEXT: vpshufb %xmm26, %xmm3, %xmm3 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1] +; AVX512BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpshufb %xmm1, %xmm25, %xmm0 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm23, %xmm1 +; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] +; AVX512BW-NEXT: vpshufb %xmm27, %xmm1, %xmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-NEXT: vpermw %zmm8, %zmm1, %zmm1 +; AVX512BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm16, 384(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 
(%rax), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4 -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm10, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm11 +; 
AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm2 +; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm5 +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm9 +; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 ; AVX512BW-FCP-NEXT: kmovq %r10, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm10 -; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpor %ymm11, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm16, %ymm5 +; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm27 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm22, %ymm3 
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm23 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm23, %ymm7 +; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm30 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm30, %xmm30 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,1,0,1] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm30, %zmm3 +; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 +; AVX512BW-FCP-NEXT: kmovq %r10, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 +; AVX512BW-FCP-NEXT: kmovq %r10, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm9, %ymm25 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm27 +; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm26 +; AVX512BW-FCP-NEXT: vporq %ymm25, %ymm26, %ymm30 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm27 +; AVX512BW-FCP-NEXT: vporq %ymm9, %ymm27, %ymm9 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm9, %ymm27 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm28 +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm28, %ymm29 +; AVX512BW-FCP-NEXT: vporq %ymm27, %ymm29, %ymm31 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm28, %ymm28 +; AVX512BW-FCP-NEXT: vporq %ymm9, %ymm28, %ymm9 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = 
ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm31, %zmm9 +; AVX512BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 +; AVX512BW-FCP-NEXT: kmovq %r10, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm30, %zmm9 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm30 +; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm30, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm28 +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm28, %ymm24 +; AVX512BW-FCP-NEXT: vporq %ymm17, %ymm24, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm28, %ymm31 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm30, %ymm30 +; AVX512BW-FCP-NEXT: vporq %ymm31, %ymm30, %ymm30 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,3,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm30, %zmm17, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm30 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm31 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512BW-FCP-NEXT: vpermw %ymm30, %ymm31, %ymm31 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm30 = ymm30[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 ; AVX512BW-FCP-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm30, %zmm17 {%k2} ; AVX512BW-FCP-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm16 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm8 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm8 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm13 -; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = 
xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm13 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm23, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm1 -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7] -; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm11, %xmm1 -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = 
[128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm14, %xmm1 -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm28 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm19[4,5,6,7],zmm18[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm17[23],zero,zmm17[21,22,23,26],zero,zmm17[24],zero,zmm17[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm17[62],zero,zmm17[60],zero,zero,zero,zero,zmm17[63],zero,zmm17[61],zero +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm18[4,5,6,7],zmm19[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm30 = zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm30[25],zero,zmm30[23],zero,zero,zero,zero,zmm30[26],zero,zmm30[24],zero,zero,zero,zero,zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm30[62],zero,zmm30[60],zero,zero,zero,zero,zmm30[63],zero,zmm30[61],zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm17, %zmm30, %zmm17 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm20[4,5,6,7],zmm21[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm30 = zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm30[23],zero,zero,zero,zero,zmm30[26],zero,zmm30[24],zero,zero,zero,zero,zmm30[27],zero,zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm30[60],zero,zmm30[62,63,62,63],zero,zmm30[61],zero,zmm30[63,60,61] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm21[4,5,6,7],zmm20[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm31 = zmm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm31[23],zero,zero,zero,zero,zmm31[26],zero,zmm31[24],zero,zero,zero,zero,zmm31[27],zero,zmm31[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm31[62],zero,zmm31[60],zero,zero,zero,zero,zmm31[63],zero,zmm31[61],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %zmm30, %zmm31, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm31 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm17[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm17 = zmm30[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} -; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[4,5,6,7],zmm31[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero +; AVX512BW-FCP-NEXT: 
vshufi64x2 {{.*#+}} zmm30 = zmm31[4,5,6,7],zmm15[4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm30 = zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm30[25],zero,zmm30[23],zero,zero,zero,zero,zmm30[26],zero,zmm30[24],zero,zero,zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm30[59],zero,zero,zero,zero,zmm30[62],zero,zmm30[60],zero,zero,zero,zero,zmm30[63],zero,zmm30[61] +; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm30, %zmm0 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 +; AVX512BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm17 {%k3} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb %zmm27, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512BW-FCP-NEXT: vpshufb %zmm29, %zmm20, %zmm20 +; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm20, %zmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm19 +; AVX512BW-FCP-NEXT: vpshufb %zmm25, %zmm19, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpshufb %zmm26, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20 +; AVX512BW-FCP-NEXT: vporq %zmm21, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm25 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,3,2,3,6,7,6,7] ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm16[0],xmm7[1],xmm16[1],xmm7[2],xmm16[2],xmm7[3],xmm16[3],xmm7[4],xmm16[4],xmm7[5],xmm16[5],xmm7[6],xmm16[6],xmm7[7],xmm16[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[28,29,30],zero,ymm9[28],zero,ymm9[30,31,30,31],zero,ymm9[29],zero,ymm9[31,28,29] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm31 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm31[0],xmm3[1],xmm31[1],xmm3[2],xmm31[2],xmm3[3],xmm31[3],xmm3[4],xmm31[4],xmm3[5],xmm31[5],xmm3[6],xmm31[6],xmm3[7],xmm31[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm29 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero -; AVX512BW-FCP-NEXT: vporq %ymm23, %ymm29, %ymm23 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = 
zmm23[2,3,2,3],zmm1[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm0, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm28 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29] -; AVX512BW-FCP-NEXT: vporq %ymm1, %ymm28, %ymm1 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm15 +; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm15, %zmm15 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm24 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm31, %zmm21 +; AVX512BW-FCP-NEXT: vpshufb %zmm28, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm26 +; AVX512BW-FCP-NEXT: vporq %zmm15, %zmm21, %zmm15 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] +; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 +; AVX512BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm15 {%k3} +; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm18 {%k3} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm24[0],xmm26[0],xmm24[1],xmm26[1],xmm24[2],xmm26[2],xmm24[3],xmm26[3],xmm24[4],xmm26[4],xmm24[5],xmm26[5],xmm24[6],xmm26[6],xmm24[7],xmm26[7] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm21, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm22 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[28,29,30],zero,ymm22[28],zero,ymm22[30,31,30,31],zero,ymm22[29],zero,ymm22[31,28,29] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm23[30],zero,ymm23[28],zero,zero,zero,zero,ymm23[31],zero,ymm23[29],zero,zero,zero +; AVX512BW-FCP-NEXT: vporq %ymm22, %ymm23, %ymm22 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[2,3,2,3],zmm21[0,1,0,1] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm0[0],xmm25[1],xmm0[1],xmm25[2],xmm0[2],xmm25[3],xmm0[3],xmm25[4],xmm0[4],xmm25[5],xmm0[5],xmm25[6],xmm0[6],xmm25[7],xmm0[7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm23, %xmm23 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = 
ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm16[30],zero,ymm16[28],zero,zero,zero,zero,ymm16[31],zero,ymm16[29],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero +; AVX512BW-FCP-NEXT: vporq %ymm16, %ymm14, %ymm14 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm14[2,3,2,3],zmm23[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm22, %xmm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] +; AVX512BW-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[2,3,2,3],zmm22[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm11, %zmm11 ; AVX512BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm10 {%k1} ; AVX512BW-FCP-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[4,5,6,7],zmm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm28[23],zero,zmm28[21,22,23,26],zero,zmm28[24],zero,zmm28[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm0[25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm2[4,5,6,7],zmm1[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zero,zero,zmm5[27],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm5[60],zero,zmm5[62,63,62,63],zero,zmm5[61],zero,zmm5[63,60,61] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm1[4,5,6,7],zmm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = 
zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zero,zero,zmm6[27],zero,zmm6[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm6[62],zero,zmm6[60],zero,zero,zero,zero,zmm6[63],zero,zmm6[61],zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm28[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm28 = zmm5[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 -; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[4,5,6,7],zmm6[4,5,6,7] -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7],zmm5[4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[59],zero,zero,zero,zero,zmm5[62],zero,zmm5[60],zero,zero,zero,zero,zmm5[63],zero,zmm5[61] -; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 -; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm28 {%k1} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm28 {%k2} -; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm31, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm3, %xmm5 -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm31[8],xmm3[8],xmm31[9],xmm3[9],xmm31[10],xmm3[10],xmm31[11],xmm3[11],xmm31[12],xmm3[12],xmm31[13],xmm3[13],xmm31[14],xmm3[14],xmm31[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm3 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm16, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm7, %xmm5 -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm16[8],xmm7[8],xmm16[9],xmm7[9],xmm16[10],xmm7[10],xmm16[11],xmm7[11],xmm16[12],xmm7[12],xmm16[13],xmm7[13],xmm16[14],xmm7[14],xmm16[15],xmm7[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm5, %xmm5 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm5[0,1,0,1] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm29, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm30, %xmm5 -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm5 -; 
AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm5[0,1,0,1] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm5, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm14 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm25, %xmm23 +; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm23, %xmm11 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm25[8],xmm0[9],xmm25[9],xmm0[10],xmm25[10],xmm0[11],xmm25[11],xmm0[12],xmm25[12],xmm0[13],xmm25[13],xmm0[14],xmm25[14],xmm0[15],xmm25[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,0,1],zmm0[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm24, %xmm25 +; AVX512BW-FCP-NEXT: vporq %xmm12, %xmm25, %xmm12 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm13[0,1,0,1] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm12 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm19, %xmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm20, %xmm25 +; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm25, %xmm13 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,0,1],zmm19[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm19, %zmm19 ; AVX512BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[18],zero,zmm11[18,19,20,21],zero,zmm11[19],zero,zmm11[25,26,27,22],zero,zmm11[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[56,57],zero,zmm11[55],zero,zmm11[53,54,55,58],zero,zmm11[56],zero,zmm11[60,61,58,59] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = 
zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm12[18],zero,zero,zero,zero,zmm12[21],zero,zmm12[19],zero,zero,zero,zero,zmm12[22],zero,zmm12[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm12[57],zero,zmm12[55],zero,zero,zero,zero,zmm12[58],zero,zmm12[56],zero,zero,zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm5, %zmm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[21],zero,zmm2[19],zero,zero,zero,zero,zmm2[22],zero,zmm2[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zmm2[58],zero,zmm2[56],zero,zero,zero,zero,zmm2[59],zero,zmm2[57] -; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[20],zero,zmm14[18],zero,zmm14[20,21,20,21],zero,zmm14[19],zero,zmm14[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[56,57,56,57],zero,zmm14[55],zero,zmm14[55,56,57,58],zero,zmm14[56],zero,zmm14[62,63] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[20],zero,zmm13[18],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero -; AVX512BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm13 +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm5, %xmm5 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm10[0,1,0,1] +; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm7, %xmm10 +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; 
AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1] +; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm6 {%k1} +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm2, %xmm5 +; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i8_stride7_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm8 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm8, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm12 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm16, %ymm12, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm11, %ymm2 +; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %xmm25 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %xmm23 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, 
%r10 # imm = 0x2040810204081020 +; AVX512DQ-BW-NEXT: kmovq %r10, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm16, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm18 +; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm14, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm15 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm18, %ymm4 -; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm16, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm18, %ymm5 -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %ymm24 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm23 -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm23[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm15, %ymm4 +; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm4, %ymm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm21 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm21[8],xmm22[8],xmm21[9],xmm22[9],xmm21[10],xmm22[10],xmm21[11],xmm22[11],xmm21[12],xmm22[12],xmm21[13],xmm22[13],xmm21[14],xmm22[14],xmm21[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm17, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm18 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm18, %ymm7 +; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm13 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm13[8],xmm19[8],xmm13[9],xmm19[9],xmm13[10],xmm19[10],xmm13[11],xmm19[11],xmm13[12],xmm19[12],xmm13[13],xmm19[13],xmm13[14],xmm19[14],xmm13[15],xmm19[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm10 +; AVX512DQ-BW-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 +; AVX512DQ-BW-NEXT: kmovq %r10, %k1 +; 
AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 +; AVX512DQ-BW-NEXT: kmovq %r10, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm0 +; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm0, %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm1, %ymm3 +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm1, %ymm5 +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm5 +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm20 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm20 = ymm20[0,0,1,1,4,4,5,5] ; AVX512DQ-BW-NEXT: movl $676341840, %r10d # imm = 0x28502850 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm24[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm23, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm24, %ymm6 -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 -; AVX512DQ-BW-NEXT: kmovq %r10, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm4, %ymm7 -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm9, %ymm11 -; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm11, %ymm11 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm9, %ymm9 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm4, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm20 {%k1} = ymm3[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm20 = ymm20[2,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm3, %ymm9 ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm9, %ymm4 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rax), %ymm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512DQ-BW-NEXT: vpermw %ymm11, %ymm0, %ymm12 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm4, %zmm9 +; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 +; AVX512DQ-BW-NEXT: kmovq %r10, %k3 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm2 +; AVX512DQ-BW-NEXT: vpshufb %ymm16, %ymm2, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %ymm16 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm16, %ymm20 +; AVX512DQ-BW-NEXT: vporq %ymm4, %ymm20, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm16, %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vporq %ymm16, %ymm2, %ymm2 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm4, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] +; AVX512DQ-BW-NEXT: vpermw %ymm4, %ymm6, %ymm16 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm4 {%k3} +; AVX512DQ-BW-NEXT: kmovq %rax, %k2 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm2 {%k2} ; AVX512DQ-BW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm4, %zmm6 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13 -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm13, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm15 -; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm15, %ymm8 -; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm8 -; 
AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm26 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm10 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm12, %zmm4 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm2, %ymm17, %ymm2 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm19, %ymm3 -; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm20, %ymm2 -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm22, %ymm3 -; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm22[8],xmm20[8],xmm22[9],xmm20[9],xmm22[10],xmm20[10],xmm22[11],xmm20[11],xmm22[12],xmm20[12],xmm22[13],xmm20[13],xmm22[14],xmm20[14],xmm22[15],xmm20[15] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm9 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm3, %zmm3 -; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} -; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm4 {%k3} -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: kmovq %rax, %k2 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k2} +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] +; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm24 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14] ; AVX512DQ-BW-NEXT: movl $338170920, %eax # imm = 0x14281428 -; AVX512DQ-BW-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm15, %ymm2 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = 
xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm31 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-NEXT: vpshufb %xmm31, %xmm5, %xmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm5[0,1,0,1] -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm19, %ymm5 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm7 = ymm17[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm27 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm29 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm29[0],xmm27[0],xmm29[1],xmm27[1],xmm29[2],xmm27[2],xmm29[3],xmm27[3],xmm29[4],xmm27[4],xmm29[5],xmm27[5],xmm29[6],xmm27[6],xmm29[7],xmm27[7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm21, %xmm21 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm5[2,3,2,3],zmm21[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm20[27],zero,zero,zero,zero,ymm20[30],zero,ymm20[28],zero,zero,zero,zero,ymm20[31],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[27],zero,zero,zero,zero,ymm22[30],zero,ymm22[28],zero,zero,zero,zero,ymm22[31],zero,ymm22[29] -; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm5, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 -; AVX512DQ-BW-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E -; AVX512DQ-BW-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm21 {%k2} -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm23[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm24, %ymm0 {%k3} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm24, %ymm2 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm23, %ymm23 -; AVX512DQ-BW-NEXT: vporq %ymm2, %ymm23, %ymm2 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} 
zmm23 = zmm2[2,3,2,3],zmm0[2,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm16[24,25],zero,ymm16[23],zero,ymm16[21,22,23,26],zero,ymm16[24],zero,ymm16[28,29,26,27] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[25],zero,ymm18[23],zero,zero,zero,zero,ymm18[26],zero,ymm18[24],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-BW-NEXT: vpshufb %ymm1, %ymm18, %ymm1 -; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,3,3,4,6,7,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: kmovd %eax, %k4 +; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm3, %ymm2 {%k4} +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm31 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512DQ-BW-NEXT: # ymm31 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm31, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm5, %ymm4 +; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm3[2,3,2,3],zmm2[2,3,2,3] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm3, %ymm2 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12] +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm7 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512DQ-BW-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512DQ-BW-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm1 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,6,7],zmm1[4,5,6,7] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm2[23],zero,zmm2[23,24,25,26],zero,zmm2[24],zero,zmm2[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm2[59],zero,zero,zero,zero,zmm2[62],zero,zmm2[60],zero,zero,zero,zero,zmm2[63],zero -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61] -; 
AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k2} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[4,5,6,7],zmm6[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm6[4,5,6,7],zmm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[25],zero,zmm1[23],zero,zero,zero,zero,zmm1[26],zero,zmm1[24],zero,zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[59],zero,zero,zero,zero,zmm1[62],zero,zmm1[60],zero,zero,zero,zero,zmm1[63],zero,zmm1[61] +; AVX512DQ-BW-NEXT: vporq %zmm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512DQ-BW-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k2} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k2} ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] -; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 +; AVX512DQ-BW-NEXT: kmovq %rax, %k5 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5} +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm17[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm18, %ymm0 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm29 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm30 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm29[0],xmm30[0],xmm29[1],xmm30[1],xmm29[2],xmm30[2],xmm29[3],xmm30[3],xmm29[4],xmm30[4],xmm29[5],xmm30[5],xmm29[6],xmm30[6],xmm29[7],xmm30[7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,2,3],zmm1[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm15, %ymm3 +; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm4 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm28 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm24, %xmm24 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm3[2,3,2,3],zmm24[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm24 {%k3} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm12[27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] +; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm3, %ymm1 +; 
AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm0, %xmm0 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm23 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm26, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm25, %xmm1 -; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm25[0],xmm26[0],xmm25[1],xmm26[1],xmm25[2],xmm26[2],xmm25[3],xmm26[3],xmm25[4],xmm26[4],xmm25[5],xmm26[5],xmm25[6],xmm26[6],xmm25[7],xmm26[7] -; AVX512DQ-BW-NEXT: vpshufb %xmm31, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm19, %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm17, %xmm25 -; AVX512DQ-BW-NEXT: vporq %xmm1, %xmm25, %xmm1 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm25, %xmm7 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,0,1],zmm1[0,1,0,1] -; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm20, %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm22, %xmm31 -; AVX512DQ-BW-NEXT: vporq %xmm7, %xmm31, %xmm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm22[0],xmm20[0],xmm22[1],xmm20[1],xmm22[2],xmm20[2],xmm22[3],xmm20[3],xmm22[4],xmm20[4],xmm22[5],xmm20[5],xmm22[6],xmm20[6],xmm22[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm31, %xmm5 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm7[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm7, %zmm7 -; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 -; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k3} -; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512DQ-BW-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 
%zmm5, %zmm16 {%k3} -; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,1,1,4,4,5,5] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm15[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm15, %ymm3 -; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm13, %ymm7 -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm7, %ymm3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm5[2,3,2,3],zmm3[2,3,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm17[18],zero,zmm17[18,19,20,21],zero,zmm17[19],zero,zmm17[25,26,27,22],zero,zmm17[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm17[56,57],zero,zmm17[55],zero,zmm17[53,54,55,58],zero,zmm17[56],zero,zmm17[60,61,58,59] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm19[18],zero,zero,zero,zero,zmm19[21],zero,zmm19[19],zero,zero,zero,zero,zmm19[22],zero,zmm19[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm19[57],zero,zmm19[55],zero,zero,zero,zero,zmm19[58],zero,zmm19[56],zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm5, %zmm7, %zmm5 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm22[20],zero,zmm22[18],zero,zmm22[20,21,20,21],zero,zmm22[19],zero,zmm22[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm22[56,57,56,57],zero,zmm22[55],zero,zmm22[55,56,57,58],zero,zmm22[56],zero,zmm22[62,63] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm7 = zmm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm20[20],zero,zmm20[18],zero,zero,zero,zero,zmm20[21],zero,zmm20[19],zero,zero,zero,zero,zmm20[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm20[57],zero,zmm20[55],zero,zero,zero,zero,zmm20[58],zero,zmm20[56],zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm3, %zmm7, %zmm3 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm3 = zmm3[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm7, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm24 {%k3} +; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm17[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm18[u,u,u,u,5,u,3,u,u,u,u,6,u,4,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX512DQ-BW-NEXT: vpshufb %ymm31, %ymm18, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm17, %ymm17 +; AVX512DQ-BW-NEXT: vporq %ymm1, %ymm17, %ymm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm1 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[25,26,27,22],zero,zmm1[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[56,57],zero,zmm1[55],zero,zmm1[53,54,55,58],zero,zmm1[56],zero,zmm1[60,61,58,59] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm14, 
%zmm14 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm14 = zmm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm14[18],zero,zero,zero,zero,zmm14[21],zero,zmm14[19],zero,zero,zero,zero,zmm14[22],zero,zmm14[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm14[57],zero,zmm14[55],zero,zero,zero,zero,zmm14[58],zero,zmm14[56],zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm14 = zmm1[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2} +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[20],zero,zmm0[18],zero,zmm0[20,21,20,21],zero,zmm0[19],zero,zmm0[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,56,57],zero,zmm0[55],zero,zmm0[55,56,57,58],zero,zmm0[56],zero,zmm0[62,63] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm1[20],zero,zmm1[18],zero,zero,zero,zero,zmm1[21],zero,zmm1[19],zero,zero,zero,zero,zmm1[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero +; AVX512DQ-BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] +; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm27, %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm29, %xmm3 -; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm27[8],xmm29[8],xmm27[9],xmm29[9],xmm27[10],xmm29[10],xmm27[11],xmm29[11],xmm27[12],xmm29[12],xmm27[13],xmm29[13],xmm27[14],xmm29[14],xmm27[15],xmm29[15] -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm10, %xmm3 -; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm8, %xmm2 -; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm3[0,1,0,1] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm28, %xmm0 -; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm30, %xmm1 -; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] -; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm1[0,1,0,1] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = 
[18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] -; AVX512DQ-BW-NEXT: vpermw %zmm14, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm5, %xmm7 +; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm7, %xmm1 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,0,1],zmm4[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm30, %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm29, %xmm11 +; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm11, %xmm5 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm11[0,1,0,1] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm5 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm3, %xmm11 +; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm12 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm15 +; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm15, %xmm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm11[0,1,0,1],zmm2[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm3, %zmm3 ; AVX512DQ-BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm21, %xmm0 +; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm22, %xmm2 +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm2, %xmm2 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm13, %xmm2 +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm19, %xmm3 +; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = 
xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3],xmm19[4],xmm13[4],xmm19[5],xmm13[5],xmm19[6],xmm13[6],xmm19[7],xmm13[7] +; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm3, %xmm3 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1] +; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C +; AVX512DQ-BW-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm25, %xmm0 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm23, %xmm1 +; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] +; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm1, %xmm1 +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 +; AVX512DQ-BW-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 384(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm2, 
%ymm2 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,38,33,38,32,39,33,32,39,46,41,46,40,47,41,40,47] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm10, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm9 +; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %r10 # imm = 0x2040810204081020 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = 
[0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm16, %ymm5 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm22, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm23, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm7, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm30 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm30, %xmm30 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm30, %zmm3 +; AVX512DQ-BW-FCP-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306 ; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] -; 
AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm11, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %r10 # imm = 0x3870E1C3870E1C38 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm9, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm27 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm26 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm25, %ymm26, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm27, %ymm27 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm9, %ymm27, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm30, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm9, %ymm27 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm28, %ymm29 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm27, %ymm29, %ymm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm28, %ymm28 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm9, %ymm28, %ymm9 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rax), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm31, %zmm9 +; AVX512DQ-BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm30, %zmm9 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm30, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm28 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm28, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm17, %ymm24, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm28, %ymm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm30, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vporq %ymm31, %ymm30, %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm30[2,3,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm30, %zmm17, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm30 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm31 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm30, %ymm31, %ymm31 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm30 = ymm30[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm30 ; AVX512DQ-BW-FCP-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm30, %zmm17 {%k2} ; AVX512DQ-BW-FCP-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 
= mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm23, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm18[0],xmm23[0],xmm18[1],xmm23[1],xmm18[2],xmm23[2],xmm18[3],xmm23[3],xmm18[4],xmm23[4],xmm18[5],xmm23[5],xmm18[6],xmm23[6],xmm18[7],xmm23[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm11, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm14, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 {{.*#+}} xmm28 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm19[4,5,6,7],zmm18[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm17 = zmm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm17[23],zero,zmm17[21,22,23,26],zero,zmm17[24],zero,zmm17[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm17[62],zero,zmm17[60],zero,zero,zero,zero,zmm17[63],zero,zmm17[61],zero +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm18[4,5,6,7],zmm19[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm30 = zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm30[25],zero,zmm30[23],zero,zero,zero,zero,zmm30[26],zero,zmm30[24],zero,zero,zero,zero,zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm30[62],zero,zmm30[60],zero,zero,zero,zero,zmm30[63],zero,zmm30[61],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm17, %zmm30, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm20[4,5,6,7],zmm21[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm30 = zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm30[23],zero,zero,zero,zero,zmm30[26],zero,zmm30[24],zero,zero,zero,zero,zmm30[27],zero,zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm30[60],zero,zmm30[62,63,62,63],zero,zmm30[61],zero,zmm30[63,60,61] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm21[4,5,6,7],zmm20[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm31 = zmm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm31[23],zero,zero,zero,zero,zmm31[26],zero,zmm31[24],zero,zero,zero,zero,zmm31[27],zero,zmm31[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm31[62],zero,zmm31[60],zero,zero,zero,zero,zmm31[63],zero,zmm31[61],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %zmm30, %zmm31, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm17[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm17 = zmm30[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k2} -; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm15[4,5,6,7],zmm31[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm31[4,5,6,7],zmm15[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm30 = zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm30[25],zero,zmm30[23],zero,zero,zero,zero,zmm30[26],zero,zmm30[24],zero,zero,zmm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm30[59],zero,zero,zero,zero,zmm30[62],zero,zmm30[60],zero,zero,zero,zero,zmm30[63],zero,zmm30[61] +; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, 
%zmm30, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm17 {%k3} +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm21, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm27, %zmm0, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm20, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm29, %zmm20, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, %zmm20, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm25, %zmm19, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm26, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm20 +; AVX512DQ-BW-FCP-NEXT: vporq %zmm21, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm25 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,3,2,3,6,7,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm16[0],xmm7[1],xmm16[1],xmm7[2],xmm16[2],xmm7[3],xmm16[3],xmm7[4],xmm16[4],xmm7[5],xmm16[5],xmm7[6],xmm16[6],xmm7[7],xmm16[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[28,29,30],zero,ymm9[28],zero,ymm9[30,31,30,31],zero,ymm9[29],zero,ymm9[31,28,29] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero,ymm10[29],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm31[0],xmm3[1],xmm31[1],xmm3[2],xmm31[2],xmm3[3],xmm31[3],xmm3[4],xmm31[4],xmm3[5],xmm31[5],xmm3[6],xmm31[6],xmm3[7],xmm31[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm12[30],zero,ymm12[28],zero,zero,zero,zero,ymm12[31],zero,ymm12[29],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm29 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %ymm23, %ymm29, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm23[2,3,2,3],zmm1[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm0, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm13[27],zero,zero,zero,zero,ymm13[30],zero,ymm13[28],zero,zero,zero,zero,ymm13[31],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm28 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[27],zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29] -; AVX512DQ-BW-FCP-NEXT: vporq %ymm1, %ymm28, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm15, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm31, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm28, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm26 +; AVX512DQ-BW-FCP-NEXT: vporq %zmm15, %zmm21, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm15 = zmm15[2,3,2,3,6,7,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm15 {%k3} +; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm18 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm24[0],xmm26[0],xmm24[1],xmm26[1],xmm24[2],xmm26[2],xmm24[3],xmm26[3],xmm24[4],xmm26[4],xmm24[5],xmm26[5],xmm24[6],xmm26[6],xmm24[7],xmm26[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm22 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm22[28,29,30],zero,ymm22[28],zero,ymm22[30,31,30,31],zero,ymm22[29],zero,ymm22[31,28,29] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm23[30],zero,ymm23[28],zero,zero,zero,zero,ymm23[31],zero,ymm23[29],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %ymm22, %ymm23, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm22[2,3,2,3],zmm21[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm25[0],xmm0[0],xmm25[1],xmm0[1],xmm25[2],xmm0[2],xmm25[3],xmm0[3],xmm25[4],xmm0[4],xmm25[5],xmm0[5],xmm25[6],xmm0[6],xmm25[7],xmm0[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm23, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,zero,zero,ymm16[30],zero,ymm16[28],zero,zero,zero,zero,ymm16[31],zero,ymm16[29],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[30],zero,ymm14[28],zero,zero,zero,zero,ymm14[31],zero,ymm14[29],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %ymm16, %ymm14, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 = zmm14[2,3,2,3],zmm23[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm22, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm10[27],zero,zero,zero,zero,ymm10[30],zero,ymm10[28],zero,zero,zero,zero,ymm10[31],zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[27],zero,zero,zero,zero,ymm11[30],zero,ymm11[28],zero,zero,zero,zero,ymm11[31],zero,ymm11[29] +; AVX512DQ-BW-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[2,3,2,3],zmm22[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm11, %zmm11 ; AVX512DQ-BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm11, %zmm10 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm23 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm0[4,5,6,7],zmm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7],zmm0[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,zmm28[23],zero,zmm28[21,22,23,26],zero,zmm28[24],zero,zmm28[28,29,26,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,59],zero,zero,zero,zero,zmm28[62],zero,zmm28[60],zero,zero,zero,zero,zmm28[63],zero,zmm28[61],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm0[25],zero,zmm0[23],zero,zero,zero,zero,zmm0[26],zero,zmm0[24],zero,zero,zero,zero,zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero,zmm0[61],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm0, %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm2[4,5,6,7],zmm1[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zero,zero,zmm5[27],zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,60,61,62],zero,zmm5[60],zero,zmm5[62,63,62,63],zero,zmm5[61],zero,zmm5[63,60,61] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm1[4,5,6,7],zmm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = 
zmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm6[23],zero,zero,zero,zero,zmm6[26],zero,zmm6[24],zero,zero,zero,zero,zmm6[27],zero,zmm6[25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zmm6[62],zero,zmm6[60],zero,zero,zero,zero,zmm6[63],zero,zmm6[61],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm5, %zmm6, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm6 = zmm28[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm28 = zmm5[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[4,5,6,7],zmm6[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7],zmm5[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,zmm0[23],zero,zmm0[23,24,25,26],zero,zmm0[24],zero,zmm0[30,31,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,61],zero,zmm0[59],zero,zero,zero,zero,zmm0[62],zero,zmm0[60],zero,zero,zero,zero,zmm0[63],zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm5[25],zero,zmm5[23],zero,zero,zero,zero,zmm5[26],zero,zmm5[24],zero,zero,zmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm5[59],zero,zero,zero,zero,zmm5[62],zero,zmm5[60],zero,zero,zero,zero,zmm5[63],zero,zmm5[61] -; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm28 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [27,29,28,27,28,29,29,28,27,29,28,27,28,29,29,28,30,29,30,31,31,30,30,31,30,29,30,31,31,30,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm28 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm31, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm3, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm31[8],xmm3[8],xmm31[9],xmm3[9],xmm31[10],xmm3[10],xmm31[11],xmm3[11],xmm31[12],xmm3[12],xmm31[13],xmm3[13],xmm31[14],xmm3[14],xmm31[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm3[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm16, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm7, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm16[8],xmm7[8],xmm16[9],xmm7[9],xmm16[10],xmm7[10],xmm16[11],xmm7[11],xmm16[12],xmm7[12],xmm16[13],xmm7[13],xmm16[14],xmm7[14],xmm16[15],xmm7[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,0,1],zmm5[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm29, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm30, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = 
xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1],zmm5[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm5, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm25, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm23, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm25[8],xmm0[9],xmm25[9],xmm0[10],xmm25[10],xmm0[11],xmm25[11],xmm0[12],xmm25[12],xmm0[13],xmm25[13],xmm0[14],xmm25[14],xmm0[15],xmm25[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm11[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm26, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm23 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm24, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm12, %xmm25, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm24, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,0,1],zmm13[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm19, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm20, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm25, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,0,1],zmm19[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm19 = [18,18,18,20,18,18,18,20,19,19,19,19,18,18,18,20,20,21,20,21,21,22,21,22,20,21,20,21,21,22,21,22] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm19, %zmm19 ; AVX512DQ-BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[18],zero,zmm11[18,19,20,21],zero,zmm11[19],zero,zmm11[25,26,27,22],zero,zmm11[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm11[56,57],zero,zmm11[55],zero,zmm11[53,54,55,58],zero,zmm11[56],zero,zmm11[60,61,58,59] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm12[18],zero,zero,zero,zero,zmm12[21],zero,zmm12[19],zero,zero,zero,zero,zmm12[22],zero,zmm12[20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zmm12[57],zero,zmm12[55],zero,zero,zero,zero,zmm12[58],zero,zmm12[56],zero,zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, %zmm5, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[18,19,20,21],zero,zmm1[19],zero,zmm1[21,20,21,22],zero,zmm1[20],zero,zmm1[22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,57],zero,zmm1[55],zero,zero,zero,zero,zmm1[58],zero,zmm1[56],zero,zero,zero,zero,zmm1[59],zero -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[21],zero,zmm2[19],zero,zero,zero,zero,zmm2[22],zero,zmm2[20],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zmm2[58],zero,zmm2[56],zero,zero,zero,zero,zmm2[59],zero,zmm2[57] -; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[20],zero,zmm14[18],zero,zmm14[20,21,20,21],zero,zmm14[19],zero,zmm14[19,20,21,22],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm14[56,57,56,57],zero,zmm14[55],zero,zmm14[55,56,57,58],zero,zmm14[56],zero,zmm14[62,63] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zmm13[20],zero,zmm13[18],zero,zero,zero,zero,zmm13[21],zero,zmm13[19],zero,zero,zero,zero,zmm13[22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm13[57],zero,zmm13[55],zero,zero,zero,zero,zmm13[58],zero,zmm13[56],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm13, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm5, %xmm5 +; 
AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm10[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm7, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm6 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm2, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870 +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index f434fc8c6cad8..ec4aee92b31f7 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -373,10 +373,11 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movaps 8(%ebp), %xmm3 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X86-SSE2-NEXT: pslld $16, %xmm0 ; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll 
b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index f80544fdef7e6..82945829c491e 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -371,10 +371,11 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movaps 8(%ebp), %xmm3 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X86-SSE2-NEXT: pslld $16, %xmm0 ; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 @@ -1142,18 +1143,19 @@ define i1 @icmp0_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: packssdw %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm5, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 @@ -1856,9 +1858,10 @@ define i1 @icmp_v8i32_v8i1(<8 x i32>, <8 x i32>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm1 -; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; X86-SSE2-NEXT: packssdw %xmm3, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: testl %eax, %eax ; X86-SSE2-NEXT: setne %al @@ -2099,23 +2102,27 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>, <8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 -; X86-SSE2-NEXT: pcmpeqd 72(%ebp), %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pcmpeqd 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: packssdw %xmm4, %xmm3 -; X86-SSE2-NEXT: pcmpeqd 40(%ebp), %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpeqd 24(%ebp), %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: packssdw %xmm2, %xmm1 -; X86-SSE2-NEXT: packssdw %xmm3, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm3 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa 
8(%ebp), %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm7, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm6, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: testl $43690, %eax # imm = 0xAAAA ; X86-SSE2-NEXT: setne %al ; X86-SSE2-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index 320dce840ea57..4c256774a6354 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -519,10 +519,11 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movaps 8(%ebp), %xmm3 ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X86-SSE2-NEXT: pslld $16, %xmm0 ; X86-SSE2-NEXT: psrad $16, %xmm0 -; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],mem[0,2] +; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] ; X86-SSE2-NEXT: pslld $16, %xmm2 ; X86-SSE2-NEXT: psrad $16, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 @@ -1555,18 +1556,19 @@ define i1 @icmp0_v8i64_v8i1(<8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: pxor %xmm3, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm4 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm0 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: packssdw %xmm4, %xmm1 -; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm5, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,0,3,2] ; X86-SSE2-NEXT: pand %xmm3, %xmm2 ; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 @@ -2367,9 +2369,10 @@ define i1 @icmp_v8i32_v8i1(<8 x i32>, <8 x i32>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; X86-SSE2-NEXT: pcmpeqd 8(%ebp), %xmm1 -; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; X86-SSE2-NEXT: packssdw %xmm3, %xmm0 ; X86-SSE2-NEXT: packsswb %xmm0, %xmm0 ; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: testb %al, %al @@ -2647,24 +2650,28 @@ define i1 @icmp_v8i64_v8i1(<8 x i64>, <8 x i64>) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 -; 
X86-SSE2-NEXT: pcmpeqd 72(%ebp), %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm3, %xmm4 -; X86-SSE2-NEXT: pcmpeqd 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm2, %xmm3 -; X86-SSE2-NEXT: packssdw %xmm4, %xmm3 -; X86-SSE2-NEXT: pcmpeqd 40(%ebp), %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpeqd 24(%ebp), %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] -; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: packssdw %xmm2, %xmm1 -; X86-SSE2-NEXT: packssdw %xmm3, %xmm1 -; X86-SSE2-NEXT: packsswb %xmm1, %xmm1 -; X86-SSE2-NEXT: pmovmskb %xmm1, %eax +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm3 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm7, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: packssdw %xmm6, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,0,3,2] +; X86-SSE2-NEXT: pand %xmm3, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm1, %xmm0 +; X86-SSE2-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE2-NEXT: packsswb %xmm0, %xmm0 +; X86-SSE2-NEXT: pmovmskb %xmm0, %eax ; X86-SSE2-NEXT: testb %al, %al ; X86-SSE2-NEXT: setnp %al ; X86-SSE2-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index efa6c16fbf4eb..4ae8f5faeccbf 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3051,17 +3051,18 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: pextrw $7, %xmm1, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movsbl (%rsi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movsbl (%rdx), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movsbl (%rdx), %ecx +; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movsbl (%rsi), %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_scalar_to_vector_extract: @@ -3069,21 +3070,23 @@ define <8 x i16> 
@shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { ; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: movsbl (%rsi), %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSSE3-NEXT: movsbl (%rdx), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSSE3-NEXT: movsbl (%rsi), %eax +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: palignr {{.*#+}} xmm3 = xmm1[14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_scalar_to_vector_extract: ; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 ; SSE41-NEXT: pextrw $4, %xmm0, %eax ; SSE41-NEXT: pextrw $7, %xmm0, %ecx ; SSE41-NEXT: pxor %xmm0, %xmm0 @@ -3099,7 +3102,8 @@ define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { ; ; AVX-LABEL: shuffle_scalar_to_vector_extract: ; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 ; AVX-NEXT: vpextrw $4, %xmm0, %eax ; AVX-NEXT: vpextrw $7, %xmm0, %ecx ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 @@ -3572,45 +3576,53 @@ define void @SpinningCube() { ; SSE2-LABEL: SpinningCube: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: xorps %xmm3, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSE2-NEXT: addps %xmm3, %xmm1 -; SSE2-NEXT: movaps %xmm1, (%rax) -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: addps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm1, (%rax) +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSE2-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] +; SSE2-NEXT: addps %xmm0, %xmm3 +; SSE2-NEXT: movaps %xmm3, (%rax) +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; 
SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm2, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rax) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: SpinningCube: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] -; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSSE3-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSSE3-NEXT: xorps %xmm3, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] -; SSSE3-NEXT: addps %xmm3, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] -; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: addps %xmm0, %xmm1 -; SSSE3-NEXT: movaps %xmm1, (%rax) +; SSSE3-NEXT: xorps %xmm0, %xmm0 +; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1] +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] +; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0] +; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u] +; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] +; SSSE3-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero +; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0] +; SSSE3-NEXT: addps %xmm0, %xmm3 +; SSSE3-NEXT: movaps %xmm3, (%rax) +; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: addps %xmm2, %xmm0 +; SSSE3-NEXT: movaps %xmm0, (%rax) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: SpinningCube: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0] ; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] ; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE41-NEXT: movaps %xmm1, %xmm3 @@ -3629,7 +3641,7 @@ define void @SpinningCube() { ; AVX-LABEL: SpinningCube: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0] ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] ; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index 497f71aea2227..d05d0d852c661 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -50,88 +50,92 @@ define <64 x i8> @f1(ptr %p0) { ; ; AVX512F-LABEL: f1: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] 
+; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (mem & (ymm2 ^ ymm0)) -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm4 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512F-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm4 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm2 & mem) +; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512F-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX512F-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm4 ; AVX512F-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,5,7,11,13,17,19,23,25,29,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm1 & mem) -; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (mem & (ymm1 ^ ymm2)) +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 -; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] -; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: movl $2047, %eax # imm = 0x7FF -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX512BW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: movl $4192256, %eax # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 {%k1} +; AVX512BW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 {%k1} ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX512BW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512BW-NEXT: vpor %xmm1, %xmm6, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX512BW-NEXT: vmovdqa 
112(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512BW-NEXT: movl $2047, %eax # imm = 0x7FF +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -193,83 +197,91 @@ define <64 x i8> @f2(ptr %p0) { ; ; AVX512F-LABEL: f2: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ymm4) +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] +; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; 
AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm4) +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1)) +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] -; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: movl $2095104, %eax # imm = 0x1FF800 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 {%k1} +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512BW-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: movl $-2097152, %eax # imm = 0xFFE00000 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; 
AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, 128(%rdi), %zmm2, %zmm2 -; AVX512BW-NEXT: movabsq $8998403163813888, %rax # imm = 0x1FF800001FF800 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,35,37,41,43,47,49,53,55,59,61,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: movabsq $8796090925056, %rax # imm = 0x7FFFFE00000 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 {%k1} +; AVX512BW-NEXT: movl $2047, %eax # imm = 0x7FF +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI-LABEL: f2: @@ -331,20 +343,20 @@ define <64 x i8> @f3(ptr %p0) { ; ; AVX512F-LABEL: f3: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %xmm0, %xmm1, %xmm2 +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] +; AVX512F-NEXT: vpshufb %xmm1, %xmm3, %xmm3 +; AVX512F-NEXT: 
vpor %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] -; AVX512F-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpmovsxwd {{.*#+}} ymm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm5 & (ymm0 ^ ymm2)) +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm5 & (ymm2 ^ ymm3)) ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 @@ -352,70 +364,71 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX512F-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm9, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX512F-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm5 & (ymm2 ^ ymm1)) -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512F-NEXT: vmovdqa (%rdi), %xmm3 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX512F-NEXT: vpor %xmm6, %xmm3, %xmm3 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm6 +; AVX512F-NEXT: vpshufb %ymm4, %ymm6, %ymm4 +; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] +; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512F-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 +; AVX512F-NEXT: vpshufb %xmm1, %xmm4, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %ymm0, %ymm5, %ymm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5) +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] +; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm2 +; 
AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm5 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] +; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 ; AVX512BW-NEXT: movl $-2097152, %eax # imm = 0xFFE00000 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1} -; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm4 +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm5 {%k1} +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm4 +; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX512BW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm3, %ymm3 -; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1} -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm1 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI-LABEL: f3: @@ -476,83 +489,91 @@ define <64 x i8> @f4(ptr %p0) { ; ; AVX512F-LABEL: f4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 
{{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ymm4) +; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm0 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] +; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] +; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512F-NEXT: vpor %xmm0, %xmm6, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) +; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm2 +; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX512F-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm4 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512F-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & mem) -; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm0 -; AVX512F-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 -; AVX512F-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512F-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] -; 
AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2)) +; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX512F-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm4) +; AVX512F-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1)) +; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: f4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] -; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14] +; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX512BW-NEXT: movl $2095104, %eax # imm = 0x1FF800 +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vpshufb %ymm4, %ymm1, %ymm0 {%k1} +; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm1 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm6 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] +; AVX512BW-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX512BW-NEXT: vpor %xmm1, %xmm6, %xmm1 +; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX512BW-NEXT: movl $-2097152, %eax # imm = 0xFFE00000 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2} +; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vinserti64x4 $1, 128(%rdi), %zmm2, %zmm2 -; AVX512BW-NEXT: movabsq $8998403163813888, %rax # imm = 0x1FF800001FF800 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,34,36,40,42,46,48,52,54,58,60,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX512BW-NEXT: vpor %xmm1, %xmm2, 
%xmm1 +; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 -; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1 -; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[4,5,6,7] -; AVX512BW-NEXT: movabsq $8796090925056, %rax # imm = 0x7FFFFE00000 -; AVX512BW-NEXT: kmovq %rax, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm3 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3 +; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm2 {%k1} +; AVX512BW-NEXT: movl $2047, %eax # imm = 0x7FF +; AVX512BW-NEXT: kmovd %eax, %k1 +; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VBMI-LABEL: f4: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll index 8f78438dedf92..574762620cbe8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll @@ -47,7 +47,7 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0, ; ALL-NEXT: andl $3, %edx ; ALL-NEXT: andl $3, %esi ; ALL-NEXT: vmovaps %ymm0, (%rsp) -; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] +; ALL-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; ALL-NEXT: movq %rbp, %rsp diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index da8a3f3fa0d4e..3040240cdbea8 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -3030,32 +3030,33 @@ define <8 x i8> @trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" ; ; AVX1-LABEL: trunc_packus_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; 
AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8: @@ -3317,33 +3318,34 @@ define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-wi ; ; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 ; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpackusdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll 
b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index d0cdbf1e3f08d..f15e989485a94 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -2769,32 +2769,33 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i64_v8i8: @@ -3062,33 +3063,34 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt ; ; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [127,127] +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; 
AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpackssdw %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_ssat_v8i64_v8i8_store: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index a5d83a86f295e..76aff906455f4 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -2172,32 +2172,33 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(ptr %p0) { ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, 
%xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm2, %xmm8, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_usat_v8i64_v8i8: @@ -2354,33 +2355,34 @@ define void @trunc_usat_v8i64_v8i8_store(ptr %p0, ptr%p1) { ; ; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovdqa (%rdi), %ymm0 +; AVX1-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: # xmm4 = mem[0,0] -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: # xmm6 = mem[0,0] -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8 +; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: # xmm4 = mem[0,0] +; AVX1-NEXT: vblendvpd %xmm2, %xmm8, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: trunc_usat_v8i64_v8i8_store: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 7cddebdca5cca..92fc4fe0b6b1c 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ 
b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -982,38 +982,38 @@ ret void define void @interleaved_store_vf32_i8_stride3(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf32_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 -; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm6, %xmm4, 
%xmm4 -; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqu %xmm5, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm3, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm4, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm4, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm1, 48(%rdi) ; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm1, (%rdi) +; AVX1-NEXT: vmovdqu %xmm3, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm5, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1073,109 +1073,103 @@ ret void define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, ptr %p) nounwind { ; AVX1-LABEL: interleaved_store_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-NEXT: vmovdqa %ymm2, %ymm4 -; AVX1-NEXT: vmovdqa %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 +; AVX1-NEXT: vmovdqa %ymm5, %ymm10 +; AVX1-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm6 +; AVX1-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm11 +; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm8 +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm9 +; AVX1-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm10, %xmm9, %xmm9 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 +; AVX1-NEXT: vpshufb %xmm7, %xmm12, %xmm10 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10] -; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 -; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm7 -; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm8 -; AVX1-NEXT: vpor %xmm7, %xmm8, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm15 -; AVX1-NEXT: vpshufb %xmm14, %xmm15, %xmm10 -; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [11,12,13,14,15,0,1,2,3,4,5,128,128,128,128,128] -; AVX1-NEXT: vpshufb %xmm0, %xmm9, %xmm10 -; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm5 -; AVX1-NEXT: vpshufb %xmm0, %xmm12, %xmm12 -; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm2 -; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm14 +; AVX1-NEXT: vpshufb %xmm12, %xmm14, %xmm15 +; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm5 +; AVX1-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm13, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm15 +; AVX1-NEXT: vpor %xmm1, %xmm15, %xmm1 ; 
AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm15 +; AVX1-NEXT: vpshufb %xmm12, %xmm15, %xmm7 +; AVX1-NEXT: vpor %xmm7, %xmm11, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm8 -; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15] -; AVX1-NEXT: vpshufb %xmm2, %xmm13, %xmm2 -; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm13, %xmm12, %xmm13 -; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm5, %xmm11, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm11 -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4] -; AVX1-NEXT: vpor %xmm0, %xmm15, %xmm15 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] -; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm7 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm7 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] +; AVX1-NEXT: vpshufb %xmm13, %xmm0, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm5 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm12 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] +; AVX1-NEXT: vpshufb %xmm13, %xmm11, %xmm11 +; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX1-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4] +; AVX1-NEXT: vpor %xmm13, %xmm8, %xmm13 +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] ; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] -; AVX1-NEXT: vpshufb %xmm10, %xmm6, %xmm12 -; AVX1-NEXT: vpor %xmm12, %xmm8, %xmm8 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb %xmm10, %xmm9, %xmm9 -; AVX1-NEXT: vpor %xmm9, %xmm12, %xmm9 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpshufb %xmm10, %xmm14, %xmm14 -; AVX1-NEXT: vpor %xmm14, %xmm12, %xmm12 -; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm10 -; AVX1-NEXT: vpshufb %xmm7, %xmm15, %xmm6 -; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm11 -; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu %xmm6, 80(%rdi) -; AVX1-NEXT: vmovdqu %xmm9, 64(%rdi) -; AVX1-NEXT: vmovdqu %xmm8, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm4, (%rdi) -; AVX1-NEXT: vmovdqu %xmm10, 48(%rdi) -; AVX1-NEXT: vmovdqu %xmm0, 32(%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 176(%rdi) -; AVX1-NEXT: vmovdqu %xmm1, 160(%rdi) -; AVX1-NEXT: vmovdqu %xmm12, 112(%rdi) +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [5,128,11,6,128,12,7,128,13,8,128,14,9,128,15,10] +; 
AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,5,128,128,6,128,128,7,128,128,8,128,128,9,128,128] +; AVX1-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX1-NEXT: vpor %xmm4, %xmm7, %xmm4 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vpshufb %xmm9, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm11 +; AVX1-NEXT: vpor %xmm7, %xmm11, %xmm7 +; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX1-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm9, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm9, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm11 +; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm5 +; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm9, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm9, %xmm14, %xmm12 +; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm9 +; AVX1-NEXT: vmovdqu %xmm5, 80(%rdi) +; AVX1-NEXT: vmovdqu %xmm0, 64(%rdi) +; AVX1-NEXT: vmovdqu %xmm4, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm11, 48(%rdi) +; AVX1-NEXT: vmovdqu %xmm9, 176(%rdi) +; AVX1-NEXT: vmovdqu %xmm8, 160(%rdi) +; AVX1-NEXT: vmovdqu %xmm7, 112(%rdi) +; AVX1-NEXT: vmovdqu %xmm12, 144(%rdi) +; AVX1-NEXT: vmovdqu %xmm2, (%rdi) +; AVX1-NEXT: vmovdqu %xmm6, 32(%rdi) ; AVX1-NEXT: vmovdqu %xmm3, 96(%rdi) -; AVX1-NEXT: vmovdqu %xmm11, 144(%rdi) -; AVX1-NEXT: vmovdqu %xmm5, 128(%rdi) -; AVX1-NEXT: addq $24, %rsp +; AVX1-NEXT: vmovdqu %xmm1, 128(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1274,116 +1268,117 @@ ret void define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm11 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 -; AVX1-NEXT: vmovups 64(%rdi), %xmm0 -; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 -; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 -; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 -; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 -; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 -; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 +; AVX1-NEXT: vmovdqu (%rdi), %xmm9 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm7 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm11 +; AVX1-NEXT: vmovdqu 96(%rdi), %xmm6 +; AVX1-NEXT: vmovdqu 112(%rdi), %xmm3 +; AVX1-NEXT: vmovdqu 128(%rdi), %xmm4 +; AVX1-NEXT: vmovdqu 144(%rdi), %xmm12 +; AVX1-NEXT: vmovdqu 160(%rdi), %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm2 +; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm5 +; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm8, %xmm4, %xmm2 +; AVX1-NEXT: 
vpshufb %xmm13, %xmm6, %xmm8
+; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm1
+; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
-; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5
+; AVX1-NEXT: vpshufb %xmm14, %xmm12, %xmm8
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
-; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, %xmm5
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm10
+; AVX1-NEXT: vpor %xmm8, %xmm10, %xmm0
 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10
-; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12
-; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0
+; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm6
+; AVX1-NEXT: vmovdqa %xmm3, %xmm1
+; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm10
+; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm0
 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqu 64(%rdi), %xmm6
+; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm12
+; AVX1-NEXT: vpshufb %xmm13, %xmm11, %xmm13
 ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX1-NEXT: vmovdqa %xmm1, %xmm0
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12
-; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13
-; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11
-; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13
-; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5
-; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13
-; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10
-; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
-; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13
-; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0
-; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm12
-; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3
-; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12
-; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14
-; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15
-; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12
-; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15
-; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15
-; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm15, %xmm6, %xmm8
+; AVX1-NEXT: vpor %xmm11, %xmm8, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm14, %xmm9, %xmm11
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm9
+; AVX1-NEXT: vpshufb %xmm15, %xmm9, %xmm10
+; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqu 80(%rdi), %xmm11
+; AVX1-NEXT: vpshufb %xmm14, %xmm6, %xmm0
+; AVX1-NEXT: vpshufb %xmm15, %xmm11, %xmm2
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm3
+; AVX1-NEXT: vmovdqu 176(%rdi), %xmm2
+; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm0
+; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm8
+; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpshufb %xmm14, %xmm9, %xmm8
+; AVX1-NEXT: vpshufb %xmm15, %xmm7, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm8
+; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm14
+; AVX1-NEXT: vpor %xmm8, %xmm14, %xmm15
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm8
+; AVX1-NEXT: vpor %xmm8, %xmm12, %xmm14
+; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm1, %xmm11, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm13, %xmm0
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm8, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
+; AVX1-NEXT: vpshufb %xmm13, %xmm7, %xmm3
+; AVX1-NEXT: vpor %xmm3, %xmm10, %xmm3
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm7, %xmm10
+; AVX1-NEXT: vpshufb %xmm13, %xmm11, %xmm11
+; AVX1-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm11
+; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm11, %xmm4
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm8
+; AVX1-NEXT: vpshufb %xmm13, %xmm2, %xmm2
 ; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
-; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
-; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13
-; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11
-; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm8, %xmm11, %xmm11
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm13, %xmm9, %xmm9
+; AVX1-NEXT: vpor %xmm9, %xmm11, %xmm9
 ; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm10, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm5[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm8, %xmm5, %xmm5
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm13, %xmm6, %xmm6
+; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm8, %xmm14, %xmm4
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm12, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index cacc43e96b6ea..05aa326c8346c 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -6787,8 +6787,8 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b
 ;
 ; AVX512BW-LABEL: vec512_v32i16_to_v4i128_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [32,1,2,3,4,5,6,7,33,9,10,11,12,13,14,15,34,1,2,3,4,5,6,7,35,9,10,11,12,13,14,15]
 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm2
@@ -7056,23 +7056,23 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
 ;
 ; AVX-LABEL: vec512_v16i32_to_v8i64_factor2:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa (%rsi), %xmm0
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vpaddb (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vpaddb (%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
 ; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
 ; AVX-NEXT: retq
 ;
 ; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 572ed314ab31d..e956a97fae919 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -967,9 +967,9 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ;
 ; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm0
 ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vpaddb (%rdi), %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
@@ -979,8 +979,8 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512F-NEXT: vpaddb (%rdi), %ymm1, %ymm1
 ; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
@@ -990,8 +990,8 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
 ; AVX512DQ: # %bb.0:
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512DQ-NEXT: vpaddb (%rdi), %ymm1, %ymm1
 ; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
@@ -1634,8 +1634,8 @@ define void @vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8(ptr %in.
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i32_factor2_broadcast_to_v8i32_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31]
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1752,8 +1752,8 @@ define void @vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i64_factor4_broadcast_to_v4i64_factor4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,0,21,22,23,0,25,26,27,0,29,30,31]
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -1867,8 +1867,8 @@ define void @vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-LABEL: vec256_i16_widen_to_i128_factor8_broadcast_to_v2i128_factor2:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,18,19,20,21,22,23,0,25,26,27,28,29,30,31]
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -2025,9 +2025,9 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
 ;
 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2177,9 +2177,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -2329,9 +2329,9 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
 ;
 ; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -3273,10 +3273,10 @@ define void @vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3(ptr %i
 ;
 ; AVX-LABEL: vec384_i8_widen_to_i128_factor16_broadcast_to_v3i128_factor3:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa (%rsi), %xmm0
+; AVX-NEXT: vmovdqa 48(%rsi), %xmm1
+; AVX-NEXT: vpaddb 48(%rdi), %xmm1, %xmm1
+; AVX-NEXT: vpaddb (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551615]
 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm1
 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -3664,9 +3664,9 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ;
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -3679,12 +3679,14 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
 ;
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -3866,9 +3868,9 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -3882,12 +3884,14 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
 ;
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero
+; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -4087,9 +4091,9 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -4103,12 +4107,14 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
 ;
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero
+; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -4291,9 +4297,9 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
 ; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
@@ -4307,12 +4313,14 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
 ;
 ; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
 ; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47,48,49,0,51,52,53,54,55,0,0,0,0,0,0,0,0]
-; AVX512BW-FAST-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
+; AVX512BW-FAST-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-FAST-NEXT: vzeroupper
 ; AVX512BW-FAST-NEXT: retq
@@ -4441,9 +4449,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
 ;
 ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -5112,17 +5120,32 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
 ; AVX512DQ-NEXT: vzeroupper
 ; AVX512DQ-NEXT: retq
 ;
-; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
-; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,21,22,23]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,28,29,30,31]
+; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
 %in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
 %in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
 %in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5376,12 +5399,14 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
 ;
 ; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
 ; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
-; AVX512BW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,11]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT: vzeroupper
 ; AVX512BW-SLOW-NEXT: retq
@@ -5648,8 +5673,8 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[0],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[16],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[32],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero,zmm0[48],zero
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -5760,8 +5785,8 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[0],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[16],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[32],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero,zmm0[48],zero,zero,zero
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -5872,8 +5897,8 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.v
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[16],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[32],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero,zmm0[48],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -5965,8 +5990,8 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -6171,8 +6196,8 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[0,1],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[16,17],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[32,33],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero,zmm0[48,49],zero,zero
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -6289,8 +6314,8 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[0,1],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[16,17],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[32,33],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero,zmm0[48,49],zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
@@ -6407,8 +6432,8 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
 ;
 ; AVX512BW-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0