Skip to content

Commit 0116da2

Browse files
committed
[VectorCombine] New folding pattern for extract/binop/shuffle chains
Resolves #144654 Part of #143088 This adds a new `foldShuffleChainsToReduce` for horizontal reduction of patterns like: ```llvm define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 { %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) %7 = extractelement <8 x i16> %6, i64 0 ret i16 %7 } ``` ...which can be reduced to an `llvm.vector.reduce.umin.v8i16(%a0)` intrinsic call. A similar transformation is applied for other ops when the cost model permits it.
1 parent 7936670 commit 0116da2

File tree

3 files changed

+403
-0
lines changed

3 files changed

+403
-0
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ class VectorCombine {
130130
bool foldShuffleOfIntrinsics(Instruction &I);
131131
bool foldShuffleToIdentity(Instruction &I);
132132
bool foldShuffleFromReductions(Instruction &I);
133+
bool foldShuffleChainsToReduce(Instruction &I);
133134
bool foldCastFromReductions(Instruction &I);
134135
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
135136
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2988,6 +2989,187 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
29882989
return foldSelectShuffle(*Shuffle, true);
29892990
}
29902991

2992+
/// Try to fold a chain of alternating shufflevector / min-max intrinsic
/// instructions, rooted at an `extractelement` of lane 0, into a single
/// `llvm.vector.reduce.*` intrinsic call.
///
/// The recognized shape is the classic log2(N) horizontal reduction:
///   %s1 = shufflevector %v,  poison, <N/2, ..., poison...>  ; top half down
///   %r1 = call @llvm.umin(%v, %s1)
///   %s2 = shufflevector %r1, poison, <N/4, ..., poison...>
///   %r2 = call @llvm.umin(%r1, %s2)
///   ...
///   %e  = extractelement %rK, 0
/// which is replaced (when cheaper per TTI) by
///   %e  = call @llvm.vector.reduce.umin(%v)
///
/// Returns true and rewrites the IR if the fold was performed.
bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
  // Only fire on the final extractelement of the chain.
  auto *EEI = dyn_cast<ExtractElementInst>(&I);
  if (!EEI)
    return false;

  // Worklist holds the next expected instruction(s) of the chain, walked
  // top-down starting from the vector feeding the extract.
  std::queue<Value *> InstWorklist;
  Value *InitEEV = nullptr;
  // Intrinsic ID shared by every call in the chain (0 == not yet seen).
  Intrinsic::ID CommonOp = 0;

  bool IsFirstCallInst = true;
  // The chain must strictly alternate: call, shuffle, call, shuffle, ...
  // This flag tracks which kind the next visited instruction must be.
  bool ShouldBeCallInst = true;

  // Slot protocol:
  //   PrevVecV[0] = non-shuffle operand of the last visited call
  //   PrevVecV[1] = shuffle operand of the last visited call
  //   PrevVecV[2] = root vector feeding the extractelement
  SmallVector<Value *, 3> PrevVecV(3, nullptr);
  // ShuffleMaskHalf: the half-width the *current* shuffle must pull down.
  // ExpectedShuffleMaskHalf: doubles each level; the chain is complete when
  // it reaches the full vector width.
  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
  int64_t VecSize = -1;

  // Only an extract of lane 0 yields the reduction result.
  Value *VecOp;
  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
    return false;

  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
  if (!FVT)
    return false;

  // NOTE(review): only evenness is checked here, not power-of-two; for a
  // non-power-of-two even size the mask-half doubling below can never equal
  // VecSize, so the walk fails later rather than here — confirm intended.
  VecSize = FVT->getNumElements();
  if (VecSize < 2 || (VecSize % 2) != 0)
    return false;

  auto *IndexOp = EEI->getIndexOperand();
  if (!IndexOp)
    return false;

  // Redundant with the m_Zero() match above, but kept as a guard.
  auto *ConstIndex = dyn_cast<ConstantInt>(IndexOp);
  if (ConstIndex && ConstIndex->getValue() != 0)
    return false;

  ShuffleMaskHalf = 1;
  PrevVecV[2] = VecOp;
  InitEEV = EEI;

  InstWorklist.push(PrevVecV[2]);

  while (!InstWorklist.empty()) {
    Value *V = InstWorklist.front();
    InstWorklist.pop();

    auto *CI = dyn_cast<Instruction>(V);
    if (!CI)
      return false;

    if (auto *CallI = dyn_cast<CallInst>(CI)) {
      // A call is only legal when the alternation expects one.
      if (!ShouldBeCallInst || !PrevVecV[2])
        return false;

      // After the first call, all three slots must be populated.
      if (!IsFirstCallInst &&
          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
        return false;

      // The call must be the value we expected next in the chain: the root
      // for the first level, otherwise the previous call's vector operand.
      if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
        return false;
      IsFirstCallInst = false;

      auto *II = dyn_cast<IntrinsicInst>(CallI);
      if (!II)
        return false;

      // Every level of the chain must use the same min/max intrinsic.
      if (!CommonOp)
        CommonOp = II->getIntrinsicID();
      if (II->getIntrinsicID() != CommonOp)
        return false;

      switch (II->getIntrinsicID()) {
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::smin:
      case Intrinsic::smax: {
        auto *Op0 = CallI->getOperand(0);
        auto *Op1 = CallI->getOperand(1);
        PrevVecV[0] = Op0;
        PrevVecV[1] = Op1;
        break;
      }
      default:
        return false;
      }
      // Next instruction in the walk must be a shuffle.
      ShouldBeCallInst ^= 1;

      // Canonicalize so slot [1] holds the shuffle operand (the intrinsic
      // is commutative in its operands for this purpose).
      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
        std::swap(PrevVecV[0], PrevVecV[1]);
      InstWorklist.push(PrevVecV[1]);
      InstWorklist.push(PrevVecV[0]);
    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
      // A shuffle is only legal when the alternation expects one.
      if (ShouldBeCallInst ||
          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
        return false;

      if (SVInst != PrevVecV[1])
        return false;

      // The shuffle must permute the same vector the call reduces with.
      auto *ShuffleVec = SVInst->getOperand(0);
      if (!ShuffleVec || ShuffleVec != PrevVecV[0])
        return false;

      SmallVector<int> CurMask;
      SVInst->getShuffleMask(CurMask);

      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
        return false;
      ExpectedShuffleMaskHalf *= 2;

      // Mask must move lanes [Half, 2*Half) down to [0, Half) and leave the
      // rest poison (-1), i.e. the standard pairwise-reduction shuffle.
      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
          return false;
        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
          return false;
      }
      ShuffleMaskHalf *= 2;
      // Reached full width: the whole reduction chain has been matched.
      if (ExpectedShuffleMaskHalf == VecSize)
        break;
      ShouldBeCallInst ^= 1;
    } else {
      // Anything other than the expected call/shuffle breaks the pattern.
      return false;
    }
  }

  // The walk must have ended right after a shuffle level, not mid-pair.
  if (ShouldBeCallInst)
    return false;

  assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
         "Expected Match for Vector Size and Mask Half");

  // PrevVecV[0] is now the original, un-shuffled input vector of the chain.
  Value *FinalVecV = PrevVecV[0];
  auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());

  if (!InitEEV || !FinalVecV)
    return false;

  assert(FinalVecVTy && "Expected non-null value for Vector Type");

  // Map the element-wise intrinsic to its horizontal-reduction counterpart.
  Intrinsic::ID ReducedOp = 0;
  switch (CommonOp) {
  case Intrinsic::umin:
    ReducedOp = Intrinsic::vector_reduce_umin;
    break;
  case Intrinsic::umax:
    ReducedOp = Intrinsic::vector_reduce_umax;
    break;
  case Intrinsic::smin:
    ReducedOp = Intrinsic::vector_reduce_smin;
    break;
  case Intrinsic::smax:
    ReducedOp = Intrinsic::vector_reduce_smax;
    break;
  default:
    return false;
  }

  // Cost of the original chain: one full-width shuffle plus one min/max per
  // reduction level, plus the final lane-0 extract.
  InstructionCost OrigCost = 0;
  unsigned int NumLevels = Log2_64(VecSize);

  for (unsigned int Level = 0; Level < NumLevels; ++Level) {
    OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                   FinalVecVTy, FinalVecVTy);
    // NOTE(review): modeling the min/max as an ICmp via
    // getArithmeticInstrCost looks off — ICmp is not an arithmetic opcode;
    // getCmpSelInstrCost (or an IntrinsicCostAttributes for the min/max)
    // would be the conventional API. Confirm this approximation is intended.
    OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
  }
  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
                                     CostKind, 0);

  // Cost of the replacement: a single vector.reduce intrinsic.
  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);

  if (NewCost >= OrigCost)
    return false;

  // Emit the reduction and replace all uses of the original extract.
  auto *ReducedResult =
      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
  replaceValue(*InitEEV, *ReducedResult);

  return true;
}
3172+
29913173
/// Determine if its more efficient to fold:
29923174
/// reduce(trunc(x)) -> trunc(reduce(x)).
29933175
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3705,6 +3887,9 @@ bool VectorCombine::run() {
37053887
MadeChange |= foldShuffleFromReductions(I);
37063888
MadeChange |= foldCastFromReductions(I);
37073889
break;
3890+
case Instruction::ExtractElement:
3891+
MadeChange |= foldShuffleChainsToReduce(I);
3892+
break;
37083893
case Instruction::ICmp:
37093894
case Instruction::FCmp:
37103895
MadeChange |= foldExtractExtract(I);

0 commit comments

Comments
 (0)