Skip to content

Commit 430a54b

Browse files
committed
Combine and(X, shuffle(X, pow 2 mask)) to all true
Combine N = and(X, shuffle_vector(X, power-of-2 mask)) to all_true, where X is either another such N, a setcc(v, <0>, ne), or a bitcast of such a setcc.
1 parent ffb724a commit 430a54b

File tree

2 files changed

+93
-36
lines changed

2 files changed

+93
-36
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
#include "WebAssemblySubtarget.h"
1919
#include "WebAssemblyTargetMachine.h"
2020
#include "WebAssemblyUtilities.h"
21+
#include "llvm/ADT/SmallVector.h"
2122
#include "llvm/CodeGen/CallingConvLower.h"
2223
#include "llvm/CodeGen/MachineFrameInfo.h"
2324
#include "llvm/CodeGen/MachineInstrBuilder.h"
2425
#include "llvm/CodeGen/MachineJumpTableInfo.h"
2526
#include "llvm/CodeGen/MachineModuleInfo.h"
2627
#include "llvm/CodeGen/MachineRegisterInfo.h"
28+
#include "llvm/CodeGen/SDPatternMatch.h"
2729
#include "llvm/CodeGen/SelectionDAG.h"
2830
#include "llvm/CodeGen/SelectionDAGNodes.h"
2931
#include "llvm/IR/DiagnosticInfo.h"
@@ -184,6 +186,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
184186
// Combine partial.reduce.add before legalization gets confused.
185187
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
186188

189+
// Combine EXTRACT_VECTOR_ELT(AND(AND(X, SHUFFLE(X)), SHUFFLE(...)), 0)
190+
// to all_true
191+
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
192+
187193
// Combine wide-vector muls, with extend inputs, to extmul_half.
188194
setTargetDAGCombine(ISD::MUL);
189195

@@ -3287,6 +3293,85 @@ static SDValue performSETCCCombine(SDNode *N,
32873293

32883294
return SDValue();
32893295
}
3296+
/// Build a shuffle mask of length \p NumElements whose leading entries are the
/// lane indices 2^Power, 2^Power + 1, ..., 2^(Power+1) - 1 and whose remaining
/// entries are all -1.
///
/// For example, with NumElements = 8:
///   Power = 0: <1 -1 -1 -1 -1 -1 -1 -1>
///   Power = 1: <2  3 -1 -1 -1 -1 -1 -1>
///   Power = 2: <4  5  6  7 -1 -1 -1 -1>
///
/// \param Power       Non-negative exponent selecting the index range.
/// \param NumElements Length of the returned mask; must be at least
///                    2^(Power+1) (checked by assertion).
/// \returns The mask as a SmallVector<int> with exactly NumElements entries.
static SmallVector<int> buildMaskArrayByPower(int Power, size_t NumElements) {
  assert(Power >= 0 && Power < 31 && "Power out of range for a shuffle mask");

  // Use integer shifts rather than floating-point pow(), and the standard
  // `unsigned` type rather than the non-portable `uint` typedef.
  const unsigned From = 1u << Power;
  const unsigned To = From << 1;
  assert(From < NumElements && To <= NumElements);

  SmallVector<int> Res;
  Res.reserve(NumElements);
  for (unsigned I = From; I < To; ++I)
    Res.push_back(static_cast<int>(I));
  // Pad the tail so the mask has one entry per lane.
  Res.resize(NumElements, -1);

  return Res;
}
3315+
/// Recursively recognize an AND-reduction tree built from shuffles.
///
/// Base case (reached when the vector has exactly 2^Power elements):
///   N is setcc(v, <0>, ne), or a bitcast of such a setcc.
/// Recursive case:
///   N = and(X, shuffle(X, poison, mask)) where mask is
///   buildMaskArrayByPower(Power, NumElements) and X itself matches with
///   Power + 1.
///
/// \param N     Node to inspect; its first result is expected to be a vector.
/// \param Power Current recursion level; callers start at 0.
/// \returns The non-zero-compared operand `v` of the innermost setcc on
///          success, or an empty SDValue if the pattern does not match.
static SDValue matchAndOfShuffle(SDNode *N, int Power) {
  using namespace llvm::SDPatternMatch;

  // Keep the shift below well defined.
  if (Power < 0 || Power >= 31)
    return SDValue();

  const EVT VT = N->getValueType(0);
  const unsigned NumElements = VT.getVectorNumElements();
  // Integer power of two; avoids comparing ints against pow()'s double.
  const unsigned Stride = 1u << Power;
  if (NumElements < Stride)
    return SDValue();

  if (NumElements == Stride) {
    // Every level of the reduction has been peeled off, so only the base case
    // may match here. An AND at this depth cannot be another reduction step,
    // and building a mask for it would violate buildMaskArrayByPower's
    // precondition (From < NumElements).
    if (N->getOpcode() == ISD::AND)
      return SDValue();

    SDValue BitCast, Matched;

    // Try for a setcc first.
    if (sd_match(N, m_c_SetCC(m_Value(Matched), m_Zero(),
                              m_SpecificCondCode(ISD::SETNE))))
      return Matched;

    // Now try for a bitcast of a setcc.
    if (!sd_match(N, m_BitCast(m_Value(BitCast))))
      return SDValue();

    if (!sd_match(BitCast, m_c_SetCC(m_Value(Matched), m_Zero(),
                                     m_SpecificCondCode(ISD::SETNE))))
      return SDValue();
    return Matched;
  }

  // Recursive case. The AND operand and the shuffle source are bound to
  // separate values and then compared: binding both to the same variable would
  // not check that they are equal, so and(A, shuffle(B, ...)) would be accepted
  // and A silently ignored.
  SDValue X, ShuffleSrc;
  const SmallVector<int> PowerIndices =
      buildMaskArrayByPower(Power, NumElements);
  if (!sd_match(N, m_And(m_Value(X),
                         m_Shuffle(m_Value(ShuffleSrc),
                                   m_VectorVT(m_Opc(ISD::POISON)),
                                   m_SpecificMask(PowerIndices)))))
    return SDValue();

  if (X != ShuffleSrc)
    return SDValue();

  return matchAndOfShuffle(X.getNode(), Power + 1);
}
3356+
/// DAG combine for EXTRACT_VECTOR_ELT.
///
/// When \p N extracts lane 0 of a vector that matchAndOfShuffle() recognizes,
/// the extract is replaced by a wasm_alltrue intrinsic applied to the value
/// returned by the matcher, zero-extended or truncated to the result type of
/// \p N. An empty SDValue is returned when nothing matches.
///
/// NOTE(review): the intrinsic is created with an i32 result and then
/// zero-extended, so the replacement is presumably 0 or 1. Confirm that all
/// users of the original lane only test it for non-zero, and that this holds
/// when the matched setcc sits behind a bitcast that changes the lane width.
static SDValue performExtractVecEltCombine(SDNode *N, SelectionDAG &DAG) {
  using namespace llvm::SDPatternMatch;

  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);

  // Only an extract of lane 0 from a vector-typed operand is of interest.
  SDValue Vec;
  const bool IsLaneZeroExtract =
      sd_match(N, m_ExtractElt(m_VectorVT(m_Value(Vec)), m_Zero()));
  if (!IsLaneZeroExtract)
    return SDValue();

  // Walk the and/shuffle tree starting at the outermost level (Power = 0).
  SDValue Reduced = matchAndOfShuffle(Vec.getNode(), 0);
  if (!Reduced)
    return SDValue();

  SDLoc DL(N);
  SDValue IntrinsicID =
      DAG.getConstant(Intrinsic::wasm_alltrue, DL, MVT::i32);
  SDValue AllTrue = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
                                {IntrinsicID, Reduced});
  return DAG.getZExtOrTrunc(AllTrue, DL, N->getValueType(0));
}
32903375

32913376
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
32923377
assert(N->getOpcode() == ISD::MUL);
@@ -3402,6 +3487,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
34023487
return performTruncateCombine(N, DCI);
34033488
case ISD::INTRINSIC_WO_CHAIN:
34043489
return performLowerPartialReduction(N, DCI.DAG);
3490+
case ISD::EXTRACT_VECTOR_ELT:
3491+
return performExtractVecEltCombine(N, DCI.DAG);
34053492
case ISD::MUL:
34063493
return performMulCombine(N, DCI.DAG);
34073494
}

llvm/test/CodeGen/WebAssembly/simd-reduceand.ll

Lines changed: 6 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,8 @@ define i1 @reduce_and_to_all_true_16i8(<16 x i8> %0) {
66
; CHECK-LABEL: reduce_and_to_all_true_16i8:
77
; CHECK: .functype reduce_and_to_all_true_16i8 (v128) -> (i32)
88
; CHECK-NEXT: # %bb.0:
9-
; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
10-
; CHECK-NEXT: i8x16.ne $push10=, $0, $pop0
11-
; CHECK-NEXT: local.tee $push9=, $0=, $pop10
12-
; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
13-
; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
14-
; CHECK-NEXT: local.tee $push7=, $0=, $pop8
15-
; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
16-
; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
17-
; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
18-
; CHECK-NEXT: i32.const $push5=, 0
19-
; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
20-
; CHECK-NEXT: return $pop6
9+
; CHECK-NEXT: i8x16.all_true $push0=, $0
10+
; CHECK-NEXT: return $pop0
2111
%2 = icmp ne <16 x i8> %0, zeroinitializer
2212
%3 = sext <16 x i1> %2 to <16 x i8>
2313
%4 = bitcast <16 x i8> %3 to <4 x i32>
@@ -31,18 +21,8 @@ define i1 @reduce_and_to_all_true_4i32(<4 x i32> %0) {
3121
; CHECK-LABEL: reduce_and_to_all_true_4i32:
3222
; CHECK: .functype reduce_and_to_all_true_4i32 (v128) -> (i32)
3323
; CHECK-NEXT: # %bb.0:
34-
; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
35-
; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
36-
; CHECK-NEXT: local.tee $push9=, $0=, $pop10
37-
; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
38-
; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
39-
; CHECK-NEXT: local.tee $push7=, $0=, $pop8
40-
; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
41-
; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
42-
; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
43-
; CHECK-NEXT: i32.const $push5=, 0
44-
; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
45-
; CHECK-NEXT: return $pop6
24+
; CHECK-NEXT: i32x4.all_true $push0=, $0
25+
; CHECK-NEXT: return $pop0
4626
%2 = icmp ne <4 x i32> %0, zeroinitializer
4727
%3 = sext <4 x i1> %2 to <4 x i32>
4828
%4 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %3)
@@ -56,18 +36,8 @@ define i1 @reduce_and_to_all_true_2i64(<2 x i64> %0) {
5636
; CHECK-LABEL: reduce_and_to_all_true_2i64:
5737
; CHECK: .functype reduce_and_to_all_true_2i64 (v128) -> (i32)
5838
; CHECK-NEXT: # %bb.0:
59-
; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
60-
; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
61-
; CHECK-NEXT: local.tee $push9=, $0=, $pop10
62-
; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
63-
; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
64-
; CHECK-NEXT: local.tee $push7=, $0=, $pop8
65-
; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
66-
; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
67-
; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
68-
; CHECK-NEXT: i32.const $push5=, 0
69-
; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
70-
; CHECK-NEXT: return $pop6
39+
; CHECK-NEXT: i32x4.all_true $push0=, $0
40+
; CHECK-NEXT: return $pop0
7141
%2 = bitcast <2 x i64> %0 to <4 x i32>
7242
%3 = icmp ne <4 x i32> %2, zeroinitializer
7343
%4 = sext <4 x i1> %3 to <4 x i32>

0 commit comments

Comments
 (0)