@@ -170,25 +170,26 @@ define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounw
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
- ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
- ; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
- ; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
- ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
- ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+ ; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+ ; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+ ; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+ ; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+ ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
- ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
- ; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
- ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
+ ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+ ; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+ ; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+ ; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -355,24 +356,26 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
- ; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
- ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
- ; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
- ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
- ; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+ ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+ ; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+ ; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+ ; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+ ; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+ ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
- ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
- ; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
- ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
+ ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+ ; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpblendmd %xmm3, %xmm1, %xmm3 {%k1}
+ ; AVX512VL-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+ ; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
@@ -600,12 +603,12 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
- ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512VL-NEXT: vmovdqu16 %xmm2, %xmm1 {%k1}
- ; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0
- ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
+ ; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7]
+ ; AVX512VL-NEXT: vpcmpnleuw %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpcmpleuw %xmm2, %xmm1, %k2
+ ; AVX512VL-NEXT: vmovdqu16 %xmm3, %xmm1 {%k1}
+ ; AVX512VL-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
@@ -923,12 +926,12 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
- ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512VL-NEXT: vmovdqu8 %xmm2, %xmm1 {%k1}
- ; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
- ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
+ ; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+ ; AVX512VL-NEXT: vpcmpnleub %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpcmpleub %xmm2, %xmm1, %k2
+ ; AVX512VL-NEXT: vmovdqu8 %xmm3, %xmm1 {%k1}
+ ; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
%or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
@@ -1139,25 +1142,25 @@ define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices)
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT: vpcmpnleuq %zmm2, %zmm1, %k1
- ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
- ; AVX512-NEXT: vpaddq %xmm1, %xmm1, %xmm1
- ; AVX512-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
- ; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
- ; AVX512-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+ ; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512-NEXT: vpblendmq %zmm3, %zmm1, %zmm3 {%k1}
+ ; AVX512-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+ ; AVX512-NEXT: vpermilpd %xmm3, %xmm0, %xmm0
+ ; AVX512-NEXT: vpcmpleuq %zmm2, %zmm1, %k1
+ ; AVX512-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2f64:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
- ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512VL-NEXT: vmovdqa64 %xmm2, %xmm1 {%k1}
- ; AVX512VL-NEXT: vpaddq %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vpermilpd %xmm1, %xmm0, %xmm0
- ; AVX512VL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vmovapd %xmm1, %xmm0 {%k1}
+ ; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [3,3]
+ ; AVX512VL-NEXT: vpcmpnleuq %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpblendmq %xmm3, %xmm1, %xmm3 {%k1}
+ ; AVX512VL-NEXT: vpaddq %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpcmpleuq %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpermilpd %xmm3, %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
%or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
@@ -1324,24 +1327,25 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
; AVX512-LABEL: var_shuffle_zero_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
- ; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
- ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
- ; AVX512-NEXT: vpermilps %xmm1, %xmm0, %xmm0
- ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
- ; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+ ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+ ; AVX512-NEXT: vpcmpnleud %zmm2, %zmm1, %k1
+ ; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512-NEXT: vpblendmd %zmm3, %zmm1, %zmm3 {%k1}
+ ; AVX512-NEXT: vpermilps %xmm3, %xmm0, %xmm0
+ ; AVX512-NEXT: vpcmpleud %zmm2, %zmm1, %k1
+ ; AVX512-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4f32:
; AVX512VL: # %bb.0:
- ; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
- ; AVX512VL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
- ; AVX512VL-NEXT: vmovdqa32 %xmm2, %xmm1 {%k1}
- ; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0
- ; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
- ; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 {%k1}
+ ; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
+ ; AVX512VL-NEXT: vpcmpnleud %xmm2, %xmm1, %k1
+ ; AVX512VL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+ ; AVX512VL-NEXT: vpcmpleud %xmm2, %xmm1, %k2
+ ; AVX512VL-NEXT: vmovdqa32 %xmm3, %xmm1 {%k1}
+ ; AVX512VL-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k2} {z}
; AVX512VL-NEXT: retq
%cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
%or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices