Skip to content

Commit 9c7ce61

Browse files
committed
Combine and(X, shuffle(X, pow 2 mask)) to all true
Combine N = and(X, shuffle_vector(X, power-of-2 mask)) to all_true, where X is either another node of the same form as N, setcc(v, <0>, ne), or a bitcast of such a setcc.
1 parent ffb724a commit 9c7ce61

File tree

2 files changed

+95
-36
lines changed

2 files changed

+95
-36
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@
1818
#include "WebAssemblySubtarget.h"
1919
#include "WebAssemblyTargetMachine.h"
2020
#include "WebAssemblyUtilities.h"
21+
#include "llvm/ADT/SmallVector.h"
2122
#include "llvm/CodeGen/CallingConvLower.h"
2223
#include "llvm/CodeGen/MachineFrameInfo.h"
2324
#include "llvm/CodeGen/MachineInstrBuilder.h"
2425
#include "llvm/CodeGen/MachineJumpTableInfo.h"
2526
#include "llvm/CodeGen/MachineModuleInfo.h"
2627
#include "llvm/CodeGen/MachineRegisterInfo.h"
28+
#include "llvm/CodeGen/SDPatternMatch.h"
2729
#include "llvm/CodeGen/SelectionDAG.h"
2830
#include "llvm/CodeGen/SelectionDAGNodes.h"
2931
#include "llvm/IR/DiagnosticInfo.h"
@@ -184,6 +186,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
184186
// Combine partial.reduce.add before legalization gets confused.
185187
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
186188

189+
// Combine EXTRACT VECTOR ELT of AND(AND(X, SHUFFLE(X)), SHUFFLE(...)), 0
190+
// to all_true
191+
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
192+
187193
// Combine wide-vector muls, with extend inputs, to extmul_half.
188194
setTargetDAGCombine(ISD::MUL);
189195

@@ -3287,6 +3293,87 @@ static SDValue performSETCCCombine(SDNode *N,
32873293

32883294
return SDValue();
32893295
}
3296+
/// Build the shuffle mask that matchAndOfShuffle expects at one level of the
/// and(X, shuffle(X, mask)) chain.
///
/// The leading lanes hold the consecutive indices
///   FromPower, FromPower + 1, ..., NextPowerOf2(FromPower) - 1
/// and every remaining lane, up to NumElements lanes in total, is -1.
///
/// For example, with NumElements = 8:
///   FromPower = 1: <1 -1 -1 -1 -1 -1 -1 -1>
///   FromPower = 2: <2  3 -1 -1 -1 -1 -1 -1>
///   FromPower = 4: <4  5  6  7 -1 -1 -1 -1>
///
/// Preconditions (asserted): FromPower <= 256, FromPower < NumElements and
/// NextPowerOf2(FromPower) <= NumElements.
static SmallVector<int> buildMaskArrayByPower(unsigned FromPower,
                                              unsigned NumElements) {
  assert(FromPower <= 256);
  const unsigned ToPower = NextPowerOf2(FromPower);
  assert(FromPower < NumElements && ToPower <= NumElements);

  SmallVector<int> Mask;
  unsigned Lane = FromPower;
  while (Lane < ToPower)
    Mask.push_back(Lane++);

  // Bring the mask to exactly NumElements lanes; added lanes are -1.
  Mask.resize(NumElements, -1);
  return Mask;
}
3316+
/// Match a chain of and(X, shuffle(X, poison, mask)) steps that bottoms out in
/// a vector "not equal to zero" comparison.
///
///   Base case:      N is setcc(v, <0>, ne), or a bitcast of such a setcc, and
///                   Power has reached the number of elements of N.
///   Recursive case: N = and(X, shuffle(X, poison, M)) where M is
///                   buildMaskArrayByPower(Power, NumElements) and X matches
///                   in turn with NextPowerOf2(Power).
///
/// \param N     Node to inspect; its first result must have a vector type.
/// \param Power Current step of the chain; callers start at 1.
/// \returns the compared vector v on success, an empty SDValue otherwise.
static SDValue matchAndOfShuffle(SDNode *N, int Power = 1) {
  using namespace llvm::SDPatternMatch;

  EVT VT = N->getValueType(0);
  int NumElements = VT.getVectorNumElements();

  // Power only ever takes the values 1, 2, 4, ... so the base case
  // (NumElements == Power) is unreachable for other lane counts. Rejecting
  // them here also keeps buildMaskArrayByPower's assertions from firing.
  if (NumElements < Power || !isPowerOf2_32(NumElements))
    return SDValue();

  if (N->getOpcode() != ISD::AND && NumElements == Power) {
    SDValue Compared;

    // Try for a setcc first.
    if (sd_match(N, m_c_SetCC(m_Value(Compared), m_Zero(),
                              m_SpecificCondCode(ISD::SETNE))))
      return Compared;

    // Now try for a bitcast of a setcc.
    SDValue SetCC;
    if (!sd_match(N, m_BitCast(m_Value(SetCC))) ||
        !sd_match(SetCC, m_c_SetCC(m_Value(Compared), m_Zero(),
                                   m_SpecificCondCode(ISD::SETNE))))
      return SDValue();

    // Each lane of N must lie entirely inside one lane of the comparison.
    // Otherwise a lane of N mixes several comparison results (e.g. v16i8
    // viewed as v4i32), and AND-ing such lanes is not the same as requiring
    // every comparison lane to be true.
    EVT SetCCVT = SetCC.getValueType();
    if (!SetCCVT.isVector() ||
        SetCCVT.getScalarSizeInBits() % VT.getScalarSizeInBits() != 0)
      return SDValue();

    return Compared;
  }

  // A further and/shuffle step needs Power < NumElements. The upper bound
  // mirrors the FromPower <= 256 assertion in buildMaskArrayByPower.
  if (Power >= NumElements || Power > 256)
    return SDValue();

  // m_Deferred makes the shuffle source the very same value as the other AND
  // operand. Binding it a second time with m_Value would accept
  // and(Y, shuffle(X, ...)) for unrelated X and Y.
  SDValue X;
  SmallVector<int> PowerIndices = buildMaskArrayByPower(Power, NumElements);
  if (sd_match(N, m_And(m_Value(X),
                        m_Shuffle(m_Deferred(X), m_VectorVT(m_Opc(ISD::POISON)),
                                  m_SpecificMask(PowerIndices)))))
    return matchAndOfShuffle(X.getNode(), NextPowerOf2(Power));

  return SDValue();
}
3358+
/// Fold extract_vector_elt(R, 0), where R is the and/shuffle chain recognised
/// by matchAndOfShuffle, into a wasm all_true intrinsic on the compared
/// vector.
///
/// \returns the replacement value, or an empty SDValue when N does not have
/// that shape.
static SDValue performExtractVecEltCombine(SDNode *N, SelectionDAG &DAG) {
  using namespace llvm::SDPatternMatch;

  assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);

  SDValue Reduction;
  if (!sd_match(N, m_ExtractElt(m_VectorVT(m_Value(Reduction)), m_Zero())))
    return SDValue();

  SDValue Compared = matchAndOfShuffle(Reduction.getNode());
  if (!Compared)
    return SDValue();

  SDLoc DL(N);
  EVT ResVT = N->getValueType(0);
  SDValue AllTrue = DAG.getNode(
      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
      {DAG.getConstant(Intrinsic::wasm_alltrue, DL, MVT::i32), Compared});

  // The extracted lane of the mask reduction is 0 or all-ones, whereas
  // all_true produces 0 or 1. Negate so that users other than a comparison
  // against zero still observe the original value.
  return DAG.getNegative(DAG.getZExtOrTrunc(AllTrue, DL, ResVT), DL, ResVT);
}
32903377

32913378
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
32923379
assert(N->getOpcode() == ISD::MUL);
@@ -3402,6 +3489,8 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
34023489
return performTruncateCombine(N, DCI);
34033490
case ISD::INTRINSIC_WO_CHAIN:
34043491
return performLowerPartialReduction(N, DCI.DAG);
3492+
case ISD::EXTRACT_VECTOR_ELT:
3493+
return performExtractVecEltCombine(N, DCI.DAG);
34053494
case ISD::MUL:
34063495
return performMulCombine(N, DCI.DAG);
34073496
}

llvm/test/CodeGen/WebAssembly/simd-reduceand.ll

Lines changed: 6 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,8 @@ define i1 @reduce_and_to_all_true_16i8(<16 x i8> %0) {
66
; CHECK-LABEL: reduce_and_to_all_true_16i8:
77
; CHECK: .functype reduce_and_to_all_true_16i8 (v128) -> (i32)
88
; CHECK-NEXT: # %bb.0:
9-
; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
10-
; CHECK-NEXT: i8x16.ne $push10=, $0, $pop0
11-
; CHECK-NEXT: local.tee $push9=, $0=, $pop10
12-
; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
13-
; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
14-
; CHECK-NEXT: local.tee $push7=, $0=, $pop8
15-
; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
16-
; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
17-
; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
18-
; CHECK-NEXT: i32.const $push5=, 0
19-
; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
20-
; CHECK-NEXT: return $pop6
9+
; CHECK-NEXT: i8x16.all_true $push0=, $0
10+
; CHECK-NEXT: return $pop0
2111
%2 = icmp ne <16 x i8> %0, zeroinitializer
2212
%3 = sext <16 x i1> %2 to <16 x i8>
2313
%4 = bitcast <16 x i8> %3 to <4 x i32>
@@ -31,18 +21,8 @@ define i1 @reduce_and_to_all_true_4i32(<4 x i32> %0) {
3121
; CHECK-LABEL: reduce_and_to_all_true_4i32:
3222
; CHECK: .functype reduce_and_to_all_true_4i32 (v128) -> (i32)
3323
; CHECK-NEXT: # %bb.0:
34-
; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
35-
; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
36-
; CHECK-NEXT: local.tee $push9=, $0=, $pop10
37-
; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
38-
; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
39-
; CHECK-NEXT: local.tee $push7=, $0=, $pop8
40-
; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
41-
; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
42-
; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
43-
; CHECK-NEXT: i32.const $push5=, 0
44-
; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
45-
; CHECK-NEXT: return $pop6
24+
; CHECK-NEXT: i32x4.all_true $push0=, $0
25+
; CHECK-NEXT: return $pop0
4626
%2 = icmp ne <4 x i32> %0, zeroinitializer
4727
%3 = sext <4 x i1> %2 to <4 x i32>
4828
%4 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %3)
@@ -56,18 +36,8 @@ define i1 @reduce_and_to_all_true_2i64(<2 x i64> %0) {
5636
; CHECK-LABEL: reduce_and_to_all_true_2i64:
5737
; CHECK: .functype reduce_and_to_all_true_2i64 (v128) -> (i32)
5838
; CHECK-NEXT: # %bb.0:
59-
; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0
60-
; CHECK-NEXT: i32x4.ne $push10=, $0, $pop0
61-
; CHECK-NEXT: local.tee $push9=, $0=, $pop10
62-
; CHECK-NEXT: i8x16.shuffle $push1=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3
63-
; CHECK-NEXT: v128.and $push8=, $pop9, $pop1
64-
; CHECK-NEXT: local.tee $push7=, $0=, $pop8
65-
; CHECK-NEXT: i8x16.shuffle $push2=, $0, $0, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
66-
; CHECK-NEXT: v128.and $push3=, $pop7, $pop2
67-
; CHECK-NEXT: i32x4.extract_lane $push4=, $pop3, 0
68-
; CHECK-NEXT: i32.const $push5=, 0
69-
; CHECK-NEXT: i32.ne $push6=, $pop4, $pop5
70-
; CHECK-NEXT: return $pop6
39+
; CHECK-NEXT: i32x4.all_true $push0=, $0
40+
; CHECK-NEXT: return $pop0
7141
%2 = bitcast <2 x i64> %0 to <4 x i32>
7242
%3 = icmp ne <4 x i32> %2, zeroinitializer
7343
%4 = sext <4 x i1> %3 to <4 x i32>

0 commit comments

Comments
 (0)