Skip to content

Commit 44a3268

Browse files
committed
[VectorCombine] New folding pattern for extract/binop/shuffle chains
Resolves #144654. Part of #143088. This adds a new `foldShuffleChainsToReduce` fold for horizontal reduction of patterns like: ```llvm define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 { %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison> %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) %7 = extractelement <8 x i16> %6, i64 0 ret i16 %7 } ``` ...which can be reduced to a single llvm.vector.reduce.umin.v8i16(%a0) intrinsic call. The same transformation applies to the other supported min/max ops when the cost model permits it.
1 parent 7936670 commit 44a3268

File tree

3 files changed

+504
-0
lines changed

3 files changed

+504
-0
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ class VectorCombine {
130130
bool foldShuffleOfIntrinsics(Instruction &I);
131131
bool foldShuffleToIdentity(Instruction &I);
132132
bool foldShuffleFromReductions(Instruction &I);
133+
bool foldShuffleChainsToReduce(Instruction &I);
133134
bool foldCastFromReductions(Instruction &I);
134135
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
135136
bool foldInterleaveIntrinsics(Instruction &I);
@@ -2988,6 +2989,179 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
29882989
return foldSelectShuffle(*Shuffle, true);
29892990
}
29902991

2992+
/// Try to fold a log2-length chain of (shufflevector, min/max-intrinsic) pairs
/// feeding `extractelement ... , 0` into a single llvm.vector.reduce.* call.
/// The matched pattern halves the "live" lanes at each step, e.g.:
///   %s1 = shufflevector %a, poison, <upper-half, poison...>
///   %m1 = @llvm.umin(%a, %s1)
///   ...repeat until one lane remains...
///   %r  = extractelement %mN, 0
/// Returns true (and rewrites) only when TTI says the reduction is cheaper.
bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
  auto *EEI = dyn_cast<ExtractElementInst>(&I);
  if (!EEI)
    return false;

  // Walk the chain bottom-up (from the extract towards the source vector).
  std::queue<Value *> InstWorklist;
  Value *InitEEV = nullptr;
  // The min/max intrinsic shared by every call in the chain (0 = not set yet).
  Intrinsic::ID CommonOp = 0;

  bool IsFirstCallInst = true;
  // Alternation flag: the chain must strictly alternate call / shuffle,
  // starting with a call (the one the extract reads from).
  bool ShouldBeCallInst = true;

  // Slot protocol: [0] = non-shuffle operand of the last matched call,
  // [1] = its shuffle operand, [2] = the vector the extract reads.
  SmallVector<Value *, 3> PrevVecV(3, nullptr);
  // Each shuffle must move `ShuffleMaskHalf` upper lanes down; the expected
  // value doubles per level (1, 2, 4, ...) until it reaches the vector size.
  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
  int64_t VecSize = -1;

  // Only an extract of lane 0 can be the final scalar of the reduction.
  Value *VecOp;
  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
    return false;

  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
  if (!FVT)
    return false;

  // The halving scheme only covers even, >= 2 element counts.
  // NOTE(review): non-power-of-2 sizes pass this test but can never reach
  // ExpectedShuffleMaskHalf == VecSize by doubling — they bail out later.
  VecSize = FVT->getNumElements();
  if (VecSize < 2 || (VecSize % 2) != 0)
    return false;

  ShuffleMaskHalf = 1;
  PrevVecV[2] = VecOp;
  InitEEV = EEI;

  InstWorklist.push(PrevVecV[2]);

  while (!InstWorklist.empty()) {
    Value *V = InstWorklist.front();
    InstWorklist.pop();

    auto *CI = dyn_cast<Instruction>(V);
    if (!CI)
      return false;

    if (auto *CallI = dyn_cast<CallInst>(CI)) {
      // A call is only legal when expected by the alternation state.
      if (!ShouldBeCallInst || !PrevVecV[2])
        return false;

      if (!IsFirstCallInst &&
          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
        return false;

      // The call must be exactly the value the previous link handed down:
      // the extract's source for the first call, slot [0] afterwards.
      if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
        return false;
      IsFirstCallInst = false;

      auto *II = dyn_cast<IntrinsicInst>(CallI);
      if (!II)
        return false;

      // Every call in the chain must use one and the same intrinsic.
      if (!CommonOp)
        CommonOp = II->getIntrinsicID();
      if (II->getIntrinsicID() != CommonOp)
        return false;

      switch (II->getIntrinsicID()) {
      case Intrinsic::umin:
      case Intrinsic::umax:
      case Intrinsic::smin:
      case Intrinsic::smax: {
        auto *Op0 = CallI->getOperand(0);
        auto *Op1 = CallI->getOperand(1);
        PrevVecV[0] = Op0;
        PrevVecV[1] = Op1;
        break;
      }
      default:
        return false;
      }
      // Next link must be a shuffle.
      ShouldBeCallInst ^= 1;

      // Normalize so slot [1] holds the shuffle operand (the intrinsic is
      // commutative in its two operands for this purpose).
      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
        std::swap(PrevVecV[0], PrevVecV[1]);
      InstWorklist.push(PrevVecV[1]);
      InstWorklist.push(PrevVecV[0]);
    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
      if (ShouldBeCallInst ||
          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
        return false;

      if (SVInst != PrevVecV[1])
        return false;

      // The shuffle must permute the same vector the call's other operand
      // came from (single-source halving shuffle).
      auto *ShuffleVec = SVInst->getOperand(0);
      if (!ShuffleVec || ShuffleVec != PrevVecV[0])
        return false;

      SmallVector<int> CurMask;
      SVInst->getShuffleMask(CurMask);

      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
        return false;
      ExpectedShuffleMaskHalf *= 2;

      // Mask shape check: first `ShuffleMaskHalf` lanes select the lanes
      // immediately above them; every remaining lane must be poison (-1).
      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
          return false;
        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
          return false;
      }
      ShuffleMaskHalf *= 2;
      // Fully halved down to one live lane: the whole chain matched.
      if (ExpectedShuffleMaskHalf == VecSize)
        break;
      ShouldBeCallInst ^= 1;
    } else {
      return false;
    }
  }

  // A dangling expectation of another call means the chain was too short.
  if (ShouldBeCallInst)
    return false;

  assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
         "Expected Match for Vector Size and Mask Half");

  // After a full match, slot [0] holds the original (root) source vector.
  Value *FinalVecV = PrevVecV[0];
  auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());

  if (!InitEEV || !FinalVecV)
    return false;

  assert(FinalVecVTy && "Expected non-null value for Vector Type");

  // Map the element-wise intrinsic to its horizontal-reduction counterpart.
  Intrinsic::ID ReducedOp = 0;
  switch (CommonOp) {
  case Intrinsic::umin:
    ReducedOp = Intrinsic::vector_reduce_umin;
    break;
  case Intrinsic::umax:
    ReducedOp = Intrinsic::vector_reduce_umax;
    break;
  case Intrinsic::smin:
    ReducedOp = Intrinsic::vector_reduce_smin;
    break;
  case Intrinsic::smax:
    ReducedOp = Intrinsic::vector_reduce_smax;
    break;
  default:
    return false;
  }

  // Cost of the original chain: log2(N) levels of (shuffle + compare-like op)
  // plus the final lane-0 extract.
  // NOTE(review): the per-level op is modeled as a vector ICmp via
  // getArithmeticInstrCost, an approximation of the min/max intrinsic's cost.
  InstructionCost OrigCost = 0;
  unsigned int NumLevels = Log2_64(VecSize);

  for (unsigned int Level = 0; Level < NumLevels; ++Level) {
    OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
                                   FinalVecVTy, FinalVecVTy);
    OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
  }
  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
                                     CostKind, 0);

  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);

  // Only transform when the single reduction is strictly cheaper.
  if (NewCost >= OrigCost)
    return false;

  auto *ReducedResult =
      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
  replaceValue(*InitEEV, *ReducedResult);

  return true;
}
3164+
29913165
/// Determine if its more efficient to fold:
29923166
/// reduce(trunc(x)) -> trunc(reduce(x)).
29933167
/// reduce(sext(x)) -> sext(reduce(x)).
@@ -3705,6 +3879,9 @@ bool VectorCombine::run() {
37053879
MadeChange |= foldShuffleFromReductions(I);
37063880
MadeChange |= foldCastFromReductions(I);
37073881
break;
3882+
case Instruction::ExtractElement:
3883+
MadeChange |= foldShuffleChainsToReduce(I);
3884+
break;
37083885
case Instruction::ICmp:
37093886
case Instruction::FCmp:
37103887
MadeChange |= foldExtractExtract(I);

0 commit comments

Comments
 (0)