Skip to content

Commit a992805

Browse files
committed
[RFC][WIP][X86] Attempt to move the AVX512 VSELECT(COND, 0, X) -> VSELECT(!COND, X, 0) fold to DAGToDAG
Initial attempt to remove the fold from ISel to avoid the regressions identified in llvm#145473. It still doesn't handle predicate widening, which might not be very pretty...
1 parent a945fb1 commit a992805

File tree

6 files changed

+217
-179
lines changed

6 files changed

+217
-179
lines changed

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,24 +1139,51 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
11391139
break;
11401140
}
11411141
case ISD::VSELECT: {
1142-
// Replace VSELECT with non-mask conditions with with BLENDV/VPTERNLOG.
1143-
EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();
1144-
if (EleVT == MVT::i1)
1145-
break;
1146-
1147-
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1148-
assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1149-
"We can't replace VSELECT with BLENDV in vXi16!");
1142+
SDValue Cond = N->getOperand(0);
1143+
SDValue LHS = N->getOperand(1);
1144+
SDValue RHS = N->getOperand(2);
1145+
EVT CondVT = Cond.getValueType();
1146+
EVT EleVT = CondVT.getVectorElementType();
11501147
SDValue R;
1151-
if (Subtarget->hasVLX() && CurDAG->ComputeNumSignBits(N->getOperand(0)) ==
1152-
EleVT.getSizeInBits()) {
1153-
R = CurDAG->getNode(X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0),
1154-
N->getOperand(0), N->getOperand(1), N->getOperand(2),
1155-
CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1148+
1149+
if (EleVT == MVT::i1) {
1150+
assert(Subtarget->hasAVX512() && "Expected AVX512 support!");
1151+
if (!ISD::isBuildVectorAllZeros(LHS.getNode()) ||
1152+
ISD::isBuildVectorAllZeros(RHS.getNode()))
1153+
break;
1154+
// If this is an AVX512 target we can improve the use of zero masking by
1155+
// swapping the operands and inverting the condition.
1156+
// vselect cond, op1, op2 = vselect not(cond), op2, op1
1157+
if (Cond.getOpcode() == ISD::SETCC &&
1158+
!ISD::isBuildVectorAllZeros(Cond.getOperand(0).getNode())) {
1159+
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
1160+
CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
1161+
R = CurDAG->getSetCC(SDLoc(N), CondVT, Cond.getOperand(0),
1162+
Cond.getOperand(1), CC);
1163+
} else if (Cond.getOpcode() == X86ISD::CMPM &&
1164+
Cond.getConstantOperandVal(2) == 0) {
1165+
// Flip FCMP EQ (predicate imm 0) -> (U)NE (predicate imm 4).
1166+
R = CurDAG->getNode(Cond.getOpcode(), SDLoc(N), CondVT,
1167+
Cond.getOperand(0), Cond.getOperand(1),
1168+
CurDAG->getTargetConstant(4, SDLoc(N), MVT::i8));
1169+
} else {
1170+
R = CurDAG->getNOT(SDLoc(N), Cond, CondVT);
1171+
}
1172+
R = CurDAG->getSelect(SDLoc(N), N->getValueType(0), R, RHS, LHS);
11561173
} else {
1157-
R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1158-
N->getOperand(0), N->getOperand(1),
1159-
N->getOperand(2));
1174+
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
1175+
assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
1176+
assert(N->getValueType(0).getVectorElementType() != MVT::i16 &&
1177+
"We can't replace VSELECT with BLENDV in vXi16!");
1178+
if (Subtarget->hasVLX() &&
1179+
CurDAG->ComputeNumSignBits(Cond) == EleVT.getSizeInBits()) {
1180+
R = CurDAG->getNode(
1181+
X86ISD::VPTERNLOG, SDLoc(N), N->getValueType(0), Cond, LHS, RHS,
1182+
CurDAG->getTargetConstant(0xCA, SDLoc(N), MVT::i8));
1183+
} else {
1184+
R = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
1185+
Cond, LHS, RHS);
1186+
}
11601187
}
11611188
--I;
11621189
CurDAG->ReplaceAllUsesWith(N, R.getNode());

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5415,6 +5415,20 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
54155415
}
54165416
}
54175417

5418+
// Match not(insert_subvector(undef, setcc(), c))
5419+
// --> insert_subvector(undef, not(setcc()), c)
5420+
if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
5421+
V.getOperand(1).getOpcode() == ISD::SETCC &&
5422+
V.getValueType().getScalarType() == MVT::i1) {
5423+
SDValue Cond = V.getOperand(1);
5424+
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
5425+
CC = ISD::getSetCCInverse(CC, Cond.getOperand(0).getValueType());
5426+
SDValue NotSub = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
5427+
Cond.getOperand(0), Cond.getOperand(1), CC);
5428+
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(V), V.getValueType(),
5429+
V.getOperand(0), NotSub, V.getOperand(2));
5430+
}
5431+
54185432
// Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
54195433
SmallVector<SDValue, 2> CatOps;
54205434
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
@@ -48049,19 +48063,6 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
4804948063
}
4805048064
}
4805148065

48052-
// Check if the first operand is all zeros and Cond type is vXi1.
48053-
// If this an avx512 target we can improve the use of zero masking by
48054-
// swapping the operands and inverting the condition.
48055-
if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
48056-
Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
48057-
ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48058-
!ISD::isBuildVectorAllZeros(RHS.getNode())) {
48059-
// Invert the cond to not(cond) : xor(op,allones)=not(op)
48060-
SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
48061-
// Vselect cond, op1, op2 = Vselect not(cond), op2, op1
48062-
return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
48063-
}
48064-
4806548066
// Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
4806648067
// get split by legalization.
4806748068
if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
@@ -48125,11 +48126,14 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
4812548126
return V;
4812648127

4812748128
// select(~Cond, X, Y) -> select(Cond, Y, X)
48128-
if (CondVT.getScalarType() != MVT::i1) {
48129+
if (CondVT.getScalarType() != MVT::i1 ||
48130+
(ISD::isBuildVectorAllZeros(LHS.getNode()) &&
48131+
!ISD::isBuildVectorAllZeros(RHS.getNode())))
4812948132
if (SDValue CondNot = IsNOT(Cond, DAG))
4813048133
return DAG.getNode(N->getOpcode(), DL, VT,
4813148134
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
4813248135

48136+
if (CondVT.getScalarType() != MVT::i1) {
4813348137
// select(pcmpeq(and(X,Pow2),0),A,B) -> select(pcmpeq(and(X,Pow2),Pow2),B,A)
4813448138
if (Cond.getOpcode() == X86ISD::PCMPEQ &&
4813548139
Cond.getOperand(0).getOpcode() == ISD::AND &&

llvm/test/CodeGen/X86/extract-vselect-setcc.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ define void @PR117684(i1 %cond, <8 x float> %vec, ptr %ptr1, ptr %ptr2) #0 {
55
; CHECK-LABEL: PR117684:
66
; CHECK: # %bb.0:
77
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
8-
; CHECK-NEXT: vcmpnltss %xmm1, %xmm0, %k1
8+
; CHECK-NEXT: vcmpltss %xmm1, %xmm0, %k0
9+
; CHECK-NEXT: knotb %k0, %k1
910
; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [NaN,NaN,NaN,NaN]
1011
; CHECK-NEXT: vinsertf32x4 $0, %xmm0, %ymm0, %ymm0 {%k1} {z}
1112
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0

llvm/test/CodeGen/X86/psubus.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -981,9 +981,9 @@ define <16 x i8> @test14(<16 x i8> %x, <16 x i32> %y) nounwind {
981981
; AVX512-LABEL: test14:
982982
; AVX512: # %bb.0: # %vector.ph
983983
; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
984+
; AVX512-NEXT: vpmovdb %zmm1, %xmm3
984985
; AVX512-NEXT: vpcmpnltud %zmm2, %zmm1, %k1
985-
; AVX512-NEXT: vpmovdb %zmm1, %xmm1
986-
; AVX512-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} {z}
986+
; AVX512-NEXT: vpsubb %xmm0, %xmm3, %xmm0 {%k1} {z}
987987
; AVX512-NEXT: vzeroupper
988988
; AVX512-NEXT: retq
989989
vector.ph:

llvm/test/CodeGen/X86/var-permute-128.ll

Lines changed: 66 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -170,25 +170,26 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
170170
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
171171
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
172172
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
173-
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
174-
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
175-
; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
176-
; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
177-
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
178-
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
173+
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
174+
; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
175+
; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
176+
; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
177+
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
178+
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
179179
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
180180
; AVX512-NEXT: vzeroupper
181181
; AVX512-NEXT: retq
182182
;
183183
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
184184
; AVX512VL: # %bb.0:
185-
; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
186-
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
187-
; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
188-
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
189-
; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
190-
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
191-
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
185+
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
186+
; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
187+
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
188+
; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
189+
; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
190+
; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
191+
; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
192+
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
192193
; AVX512VL-NEXT: retq
193194
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
194195
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -355,24 +356,26 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
355356
; AVX512-LABEL: var_shuffle_zero_v4i32:
356357
; AVX512: # %bb.0:
357358
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
358-
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
359-
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
360-
; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
361-
; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
362-
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
363-
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
359+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
360+
; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
361+
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
362+
; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
363+
; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
364+
; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
365+
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
364366
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
365367
; AVX512-NEXT: vzeroupper
366368
; AVX512-NEXT: retq
367369
;
368370
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
369371
; AVX512VL: # %bb.0:
370-
; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
371-
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
372-
; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
373-
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
374-
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
375-
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
372+
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
373+
; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
374+
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
375+
; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
376+
; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
377+
; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
378+
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
376379
; AVX512VL-NEXT: retq
377380
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
378381
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -600,12 +603,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
600603
;
601604
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
602605
; AVX512VL: # %bb.0:
603-
; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
604-
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
605-
; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
606-
; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
607-
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
608-
; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
606+
; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
607+
; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
608+
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
609+
; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
610+
; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
611+
; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
609612
; AVX512VL-NEXT: retq
610613
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
611614
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -923,12 +926,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
923926
;
924927
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
925928
; AVX512VL: # %bb.0:
926-
; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
927-
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
928-
; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
929-
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
930-
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
931-
; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
929+
; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
930+
; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
931+
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
932+
; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
933+
; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
934+
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
932935
; AVX512VL-NEXT: retq
933936
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
934937
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1139,25 +1142,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
11391142
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
11401143
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
11411144
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
1142-
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1143-
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
1144-
; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1145-
; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
1146-
; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1147-
; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
1145+
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
1146+
; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
1147+
; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
1148+
; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
1149+
; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
1150+
; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
11481151
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
11491152
; AVX512-NEXT: vzeroupper
11501153
; AVX512-NEXT: retq
11511154
;
11521155
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
11531156
; AVX512VL: # %bb.0:
1154-
; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
1155-
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1156-
; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
1157-
; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
1158-
; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
1159-
; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
1160-
; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
1157+
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
1158+
; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
1159+
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
1160+
; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
1161+
; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
1162+
; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
1163+
; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
11611164
; AVX512VL-NEXT: retq
11621165
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
11631166
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1324,24 +1327,25 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
13241327
; AVX512-LABEL: var_shuffle_zero_v4f32:
13251328
; AVX512: # %bb.0:
13261329
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
1327-
; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
1328-
; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1329-
; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
1330-
; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
1331-
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
1332-
; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
1330+
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
1331+
; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
1332+
; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
1333+
; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
1334+
; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
1335+
; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
1336+
; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
13331337
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
13341338
; AVX512-NEXT: vzeroupper
13351339
; AVX512-NEXT: retq
13361340
;
13371341
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
13381342
; AVX512VL: # %bb.0:
1339-
; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
1340-
; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
1341-
; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
1342-
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
1343-
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
1344-
; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
1343+
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
1344+
; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
1345+
; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
1346+
; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
1347+
; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
1348+
; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
13451349
; AVX512VL-NEXT: retq
13461350
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
13471351
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices

0 commit comments

Comments
 (0)