llvm · Rajveer100 · Jun 22, 2025 · Jun 28, 2025 · dtcxzyw · Jun 30, 2025
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -130,6 +130,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -2988,6 +2989,160 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }
 
+/// Map one step of a shuffle-based reduction chain to the vector.reduce.*
+/// intrinsic that applies the same operation across all lanes. Returns
+/// Intrinsic::not_intrinsic unless \p V is an add/mul/or/and/xor binary
+/// operator or a umin/umax/smin/smax intrinsic call.
+static Intrinsic::ID getReductionIDForChainStep(const Value *V) {
+  if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::umin:
+      return Intrinsic::vector_reduce_umin;
+    case Intrinsic::umax:
+      return Intrinsic::vector_reduce_umax;
+    case Intrinsic::smin:
+      return Intrinsic::vector_reduce_smin;
+    case Intrinsic::smax:
+      return Intrinsic::vector_reduce_smax;
+    default:
+      return Intrinsic::not_intrinsic;
+    }
+  }
+
+  if (const auto *BO = dyn_cast<BinaryOperator>(V)) {
+    switch (BO->getOpcode()) {
+    case Instruction::Add:
+      return Intrinsic::vector_reduce_add;
+    case Instruction::Mul:
+      return Intrinsic::vector_reduce_mul;
+    case Instruction::Or:
+      return Intrinsic::vector_reduce_or;
+    case Instruction::And:
+      return Intrinsic::vector_reduce_and;
+    case Instruction::Xor:
+      return Intrinsic::vector_reduce_xor;
+    default:
+      return Intrinsic::not_intrinsic;
+    }
+  }
+
+  return Intrinsic::not_intrinsic;
+}
+
+/// Return true if \p Shuf is a shufflevector whose first operand is \p Src and
+/// whose mask is <Half, ..., 2 * Half - 1, poison, ..., poison>, i.e. it moves
+/// the \p Half lanes starting at lane \p Half down to lane 0. The caller must
+/// keep 2 * Half within the lane count so those mask elements select from
+/// \p Src rather than from the second shuffle operand.
+static bool isHalvingShuffleOf(const Value *Shuf, const Value *Src,
+                               uint64_t Half) {
+  const auto *SVI = dyn_cast<ShuffleVectorInst>(Shuf);
+  if (!SVI || SVI->getOperand(0) != Src)
+    return false;
+
+  ArrayRef<int> Mask = SVI->getShuffleMask();
+  for (uint64_t Lane = 0, NumLanes = Mask.size(); Lane != NumLanes; ++Lane) {
+    const int64_t Expected =
+        Lane < Half ? static_cast<int64_t>(Half + Lane) : PoisonMaskElem;
+    if (Mask[Lane] != Expected)
+      return false;
+  }
+  return true;
+}
+
+/// Try to replace a log2(N)-step shuffle reduction that ends in an extract of
+/// lane 0 with a single vector.reduce.* intrinsic call, e.g. for N = 4:
+///
+///   %s1 = shufflevector <4 x i32> %x, <4 x i32> poison,
+///                       <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
+///   %r1 = add <4 x i32> %x, %s1
+///   %s0 = shufflevector <4 x i32> %r1, <4 x i32> poison,
+///                       <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+///   %r0 = add <4 x i32> %r1, %s0
+///   %e  = extractelement <4 x i32> %r0, i64 0
+///   -->
+///   %e  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
+///
+/// Every step must use the same operation (one accepted by
+/// getReductionIDForChainStep), and the fold is only performed when TTI reports
+/// the reduction as strictly cheaper than the chain.
+///
+/// \returns true if \p I was replaced by a reduction call.
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  Value *VecOp;
+  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
+    return false;
+
+  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
+  if (!FVT)
+    return false;
+
+  // Each step folds the upper half of the live lanes onto the lower half, so
+  // only power-of-two element counts can be covered exactly.
+  const uint64_t VecSize = FVT->getNumElements();
+  if (VecSize < 2 || !isPowerOf2_64(VecSize))
+    return false;
+
+  // Walk the chain bottom-up from the extract. Half is the number of lanes
+  // the current level's shuffle has to move; it doubles at every level until
+  // the whole vector is covered.
+  Intrinsic::ID ReducedOp = Intrinsic::not_intrinsic;
+  InstructionCost OrigCost = 0;
+  Value *Cur = VecOp;
+  for (uint64_t Half = 1; Half < VecSize; Half *= 2) {
+    const Intrinsic::ID StepOp = getReductionIDForChainStep(Cur);
+    if (StepOp == Intrinsic::not_intrinsic)
+      return false;
+
+    // Every level must apply the same operation. Comparing the mapped
+    // reduction IDs also rejects chains that mix a binary operator with a
+    // min/max intrinsic, which no single reduction can represent.
+    if (ReducedOp != Intrinsic::not_intrinsic && StepOp != ReducedOp)
+      return false;
+    ReducedOp = StepOp;
+
+    // All supported operations are commutative, so the shuffled copy may be
+    // either operand.
+    auto *Step = cast<Instruction>(Cur);
+    Value *Src = Step->getOperand(0);
+    Value *Shuf = Step->getOperand(1);
+    if (!isHalvingShuffleOf(Shuf, Src, Half)) {
+      std::swap(Src, Shuf);
+      if (!isHalvingShuffleOf(Shuf, Src, Half))
+        return false;
+    }
+
+    // Cost this level as the instructions that are actually present.
+    OrigCost += TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteSingleSrc, FVT, FVT,
+        cast<ShuffleVectorInst>(Shuf)->getShuffleMask(), CostKind);
+    if (auto *BO = dyn_cast<BinaryOperator>(Step))
+      OrigCost += TTI.getArithmeticInstrCost(BO->getOpcode(), FVT, CostKind);
+    else
+      OrigCost += TTI.getIntrinsicInstrCost(
+          IntrinsicCostAttributes(cast<IntrinsicInst>(Step)->getIntrinsicID(),
+                                  FVT, {Src, Shuf}),
+          CostKind);
+
+    Cur = Src;
+  }
+  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FVT,
+                                     CostKind, 0);
+
+  // Cur is now the vector feeding the top of the chain. Its type is FVT: it
+  // is an operand of a step whose operands and result all share one type.
+  Value *FinalVecV = Cur;
+  IntrinsicCostAttributes ICA(ReducedOp, FVT->getElementType(), {FinalVecV});
+  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+  if (NewCost >= OrigCost)
+    return false;
+
+  Value *ReducedResult =
+      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+  replaceValue(I, *ReducedResult);
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 ///   reduce(trunc(x)) -> trunc(reduce(x)).
 ///   reduce(sext(x))  -> sext(reduce(x)).
@@ -3705,6 +3940,9 @@ bool VectorCombine::run() {
         MadeChange |= foldShuffleFromReductions(I);
         MadeChange |= foldCastFromReductions(I);
         break;
+      case Instruction::ExtractElement:
+        MadeChange |= foldShuffleChainsToReduce(I);
+        break;
       case Instruction::ICmp:
       case Instruction::FCmp:
         MadeChange |= foldExtractExtract(I);

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -280,14 +280,12 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i1> [[TMP8]], [[SHIFT]]
-; CHECK-NEXT:    [[OR_COND:%.*]] = extractelement <2 x i1> [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp uge <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i1> [[TMP8]] to i2
+; CHECK-NEXT:    [[OR_COND:%.*]] = icmp eq i2 [[TMP9]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], splat (double 1.000000e+00)
-; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <2 x i1> [[TMP10]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i1> [[TMP10]], [[SHIFT2]]
-; CHECK-NEXT:    [[OR_COND1_NOT:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i1> [[TMP10]] to i2
+; CHECK-NEXT:    [[OR_COND1_NOT:%.*]] = icmp ne i2 [[TMP11]], 0
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND]], i1 false, i1 [[OR_COND1_NOT]]
 ; CHECK-NEXT:    ret i1 [[RETVAL_0]]
 ;

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@@ -13,9 +13,7 @@ define i1 @fcmp_and_v2f64(<2 x double> %a) {
 ;
 ; AVX-LABEL: @fcmp_and_v2f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A:%.*]], <double 4.200000e+01, double -8.000000e+00>
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    [[R:%.*]] = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> [[TMP1]])
 ; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <2 x double> %a, i32 0
@@ -117,9 +115,7 @@ define i1 @fcmp_and_v2f64_multiuse(<2 x double> %a) {
 ; AVX-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
 ; AVX-NEXT:    call void @use(double [[E1]])
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A]], <double 4.200000e+01, double -8.000000e+00>
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    [[R:%.*]] = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> [[TMP1]])
 ; AVX-NEXT:    call void @use(i1 [[R]])
 ; AVX-NEXT:    ret i1 [[R]]
 ;