@@ -130,6 +130,7 @@ class VectorCombine {
130
130
bool foldShuffleOfIntrinsics (Instruction &I);
131
131
bool foldShuffleToIdentity (Instruction &I);
132
132
bool foldShuffleFromReductions (Instruction &I);
133
+ bool foldShuffleChainsToReduce (Instruction &I);
133
134
bool foldCastFromReductions (Instruction &I);
134
135
bool foldSelectShuffle (Instruction &I, bool FromReduction = false );
135
136
bool foldInterleaveIntrinsics (Instruction &I);
@@ -2988,6 +2989,187 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
2988
2989
return foldSelectShuffle (*Shuffle, true );
2989
2990
}
2990
2991
2992
+ bool VectorCombine::foldShuffleChainsToReduce (Instruction &I) {
2993
+ auto *EEI = dyn_cast<ExtractElementInst>(&I);
2994
+ if (!EEI)
2995
+ return false ;
2996
+
2997
+ std::queue<Value *> InstWorklist;
2998
+ Value *InitEEV = nullptr ;
2999
+ Intrinsic::ID CommonOp = 0 ;
3000
+
3001
+ bool IsFirstCallInst = true ;
3002
+ bool ShouldBeCallInst = true ;
3003
+
3004
+ SmallVector<Value *, 3 > PrevVecV (3 , nullptr );
3005
+ int64_t ShuffleMaskHalf = -1 , ExpectedShuffleMaskHalf = 1 ;
3006
+ int64_t VecSize = -1 ;
3007
+
3008
+ Value *VecOp;
3009
+ if (!match (&I, m_ExtractElt (m_Value (VecOp), m_Zero ())))
3010
+ return false ;
3011
+
3012
+ auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType ());
3013
+ if (!FVT)
3014
+ return false ;
3015
+
3016
+ VecSize = FVT->getNumElements ();
3017
+ if (VecSize < 2 || (VecSize % 2 ) != 0 )
3018
+ return false ;
3019
+
3020
+ auto *IndexOp = EEI->getIndexOperand ();
3021
+ if (!IndexOp)
3022
+ return false ;
3023
+
3024
+ auto *ConstIndex = dyn_cast<ConstantInt>(IndexOp);
3025
+ if (ConstIndex && ConstIndex->getValue () != 0 )
3026
+ return false ;
3027
+
3028
+ ShuffleMaskHalf = 1 ;
3029
+ PrevVecV[2 ] = VecOp;
3030
+ InitEEV = EEI;
3031
+
3032
+ InstWorklist.push (PrevVecV[2 ]);
3033
+
3034
+ while (!InstWorklist.empty ()) {
3035
+ Value *V = InstWorklist.front ();
3036
+ InstWorklist.pop ();
3037
+
3038
+ auto *CI = dyn_cast<Instruction>(V);
3039
+ if (!CI)
3040
+ return false ;
3041
+
3042
+ if (auto *CallI = dyn_cast<CallInst>(CI)) {
3043
+ if (!ShouldBeCallInst || !PrevVecV[2 ])
3044
+ return false ;
3045
+
3046
+ if (!IsFirstCallInst &&
3047
+ any_of (PrevVecV, [](Value *VecV) { return VecV == nullptr ; }))
3048
+ return false ;
3049
+
3050
+ if (CallI != (IsFirstCallInst ? PrevVecV[2 ] : PrevVecV[0 ]))
3051
+ return false ;
3052
+ IsFirstCallInst = false ;
3053
+
3054
+ auto *II = dyn_cast<IntrinsicInst>(CallI);
3055
+ if (!II)
3056
+ return false ;
3057
+
3058
+ if (!CommonOp)
3059
+ CommonOp = II->getIntrinsicID ();
3060
+ if (II->getIntrinsicID () != CommonOp)
3061
+ return false ;
3062
+
3063
+ switch (II->getIntrinsicID ()) {
3064
+ case Intrinsic::umin:
3065
+ case Intrinsic::umax:
3066
+ case Intrinsic::smin:
3067
+ case Intrinsic::smax: {
3068
+ auto *Op0 = CallI->getOperand (0 );
3069
+ auto *Op1 = CallI->getOperand (1 );
3070
+ PrevVecV[0 ] = Op0;
3071
+ PrevVecV[1 ] = Op1;
3072
+ break ;
3073
+ }
3074
+ default :
3075
+ return false ;
3076
+ }
3077
+ ShouldBeCallInst ^= 1 ;
3078
+
3079
+ if (!isa<ShuffleVectorInst>(PrevVecV[1 ]))
3080
+ std::swap (PrevVecV[0 ], PrevVecV[1 ]);
3081
+ InstWorklist.push (PrevVecV[1 ]);
3082
+ InstWorklist.push (PrevVecV[0 ]);
3083
+ } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3084
+ if (ShouldBeCallInst ||
3085
+ any_of (PrevVecV, [](Value *VecV) { return VecV == nullptr ; }))
3086
+ return false ;
3087
+
3088
+ if (SVInst != PrevVecV[1 ])
3089
+ return false ;
3090
+
3091
+ auto *ShuffleVec = SVInst->getOperand (0 );
3092
+ if (!ShuffleVec || ShuffleVec != PrevVecV[0 ])
3093
+ return false ;
3094
+
3095
+ SmallVector<int > CurMask;
3096
+ SVInst->getShuffleMask (CurMask);
3097
+
3098
+ if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
3099
+ return false ;
3100
+ ExpectedShuffleMaskHalf *= 2 ;
3101
+
3102
+ for (int Mask = 0 , MaskSize = CurMask.size (); Mask != MaskSize; ++Mask) {
3103
+ if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
3104
+ return false ;
3105
+ if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1 )
3106
+ return false ;
3107
+ }
3108
+ ShuffleMaskHalf *= 2 ;
3109
+ if (ExpectedShuffleMaskHalf == VecSize)
3110
+ break ;
3111
+ ShouldBeCallInst ^= 1 ;
3112
+ } else {
3113
+ return false ;
3114
+ }
3115
+ }
3116
+
3117
+ if (ShouldBeCallInst)
3118
+ return false ;
3119
+
3120
+ assert (VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
3121
+ " Expected Match for Vector Size and Mask Half" );
3122
+
3123
+ Value *FinalVecV = PrevVecV[0 ];
3124
+ auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType ());
3125
+
3126
+ if (!InitEEV || !FinalVecV)
3127
+ return false ;
3128
+
3129
+ assert (FinalVecVTy && " Expected non-null value for Vector Type" );
3130
+
3131
+ Intrinsic::ID ReducedOp = 0 ;
3132
+ switch (CommonOp) {
3133
+ case Intrinsic::umin:
3134
+ ReducedOp = Intrinsic::vector_reduce_umin;
3135
+ break ;
3136
+ case Intrinsic::umax:
3137
+ ReducedOp = Intrinsic::vector_reduce_umax;
3138
+ break ;
3139
+ case Intrinsic::smin:
3140
+ ReducedOp = Intrinsic::vector_reduce_smin;
3141
+ break ;
3142
+ case Intrinsic::smax:
3143
+ ReducedOp = Intrinsic::vector_reduce_smax;
3144
+ break ;
3145
+ default :
3146
+ return false ;
3147
+ }
3148
+
3149
+ InstructionCost OrigCost = 0 ;
3150
+ unsigned int NumLevels = Log2_64 (VecSize);
3151
+
3152
+ for (unsigned int Level = 0 ; Level < NumLevels; ++Level) {
3153
+ OrigCost += TTI.getShuffleCost (TargetTransformInfo::SK_PermuteSingleSrc,
3154
+ FinalVecVTy, FinalVecVTy);
3155
+ OrigCost += TTI.getArithmeticInstrCost (Instruction::ICmp, FinalVecVTy);
3156
+ }
3157
+ OrigCost += TTI.getVectorInstrCost (Instruction::ExtractElement, FinalVecVTy,
3158
+ CostKind, 0 );
3159
+
3160
+ IntrinsicCostAttributes ICA (ReducedOp, FinalVecVTy, {FinalVecV});
3161
+ InstructionCost NewCost = TTI.getIntrinsicInstrCost (ICA, CostKind);
3162
+
3163
+ if (NewCost >= OrigCost)
3164
+ return false ;
3165
+
3166
+ auto *ReducedResult =
3167
+ Builder.CreateIntrinsic (ReducedOp, {FinalVecV->getType ()}, {FinalVecV});
3168
+ replaceValue (*InitEEV, *ReducedResult);
3169
+
3170
+ return true ;
3171
+ }
3172
+
2991
3173
// / Determine if it's more efficient to fold:
2992
3174
// / reduce(trunc(x)) -> trunc(reduce(x)).
2993
3175
// / reduce(sext(x)) -> sext(reduce(x)).
@@ -3705,6 +3887,9 @@ bool VectorCombine::run() {
3705
3887
MadeChange |= foldShuffleFromReductions (I);
3706
3888
MadeChange |= foldCastFromReductions (I);
3707
3889
break ;
3890
+ case Instruction::ExtractElement:
3891
+ MadeChange |= foldShuffleChainsToReduce (I);
3892
+ break ;
3708
3893
case Instruction::ICmp:
3709
3894
case Instruction::FCmp:
3710
3895
MadeChange |= foldExtractExtract (I);
0 commit comments