diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19e82099e87f0..435d1cd9cbafb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -130,6 +130,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -2988,6 +2989,240 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }
 
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  auto *EEI = dyn_cast<ExtractElementInst>(&I);
+  if (!EEI)
+    return false;
+
+  std::queue<Value *> InstWorklist;
+  Value *InitEEV = nullptr;
+
+  unsigned int CommonCallOp = 0, CommonBinOp = 0;
+
+  bool IsFirstCallOrBinInst = true;
+  bool ShouldBeCallOrBinInst = true;
+
+  SmallVector<Value *, 3> PrevVecV(3, nullptr);
+  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
+  int64_t VecSize = -1;
+
+  Value *VecOp;
+  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
+    return false;
+
+  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
+  if (!FVT)
+    return false;
+
+  VecSize = FVT->getNumElements();
+  if (VecSize < 2 || (VecSize % 2) != 0)
+    return false;
+
+  ShuffleMaskHalf = 1;
+  PrevVecV[2] = VecOp;
+  InitEEV = EEI;
+
+  InstWorklist.push(PrevVecV[2]);
+
+  while (!InstWorklist.empty()) {
+    Value *V = InstWorklist.front();
+    InstWorklist.pop();
+
+    auto *CI = dyn_cast<Instruction>(V);
+    if (!CI)
+      return false;
+
+    if (auto *CallI = dyn_cast<CallInst>(CI)) {
+      if (!ShouldBeCallOrBinInst || !PrevVecV[2])
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (CallI != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      auto *II = dyn_cast<IntrinsicInst>(CallI);
+      if (!II)
+        return false;
+
+      if (!CommonCallOp)
+        CommonCallOp = II->getIntrinsicID();
+      if (II->getIntrinsicID() != CommonCallOp)
+        return false;
+
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::umin:
+      case Intrinsic::umax:
+      case Intrinsic::smin:
+      case Intrinsic::smax: {
+        auto *Op0 = CallI->getOperand(0);
+        auto *Op1 = CallI->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallOrBinInst ^= 1;
+
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
+      if (!ShouldBeCallOrBinInst || !PrevVecV[2])
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      if (!CommonBinOp)
+        CommonBinOp = CI->getOpcode();
+      if (CI->getOpcode() != CommonBinOp)
+        return false;
+
+      switch (CI->getOpcode()) {
+      case BinaryOperator::Add:
+      case BinaryOperator::Mul:
+      case BinaryOperator::Or:
+      case BinaryOperator::And:
+      case BinaryOperator::Xor: {
+        auto *Op0 = BinOp->getOperand(0);
+        auto *Op1 = BinOp->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallOrBinInst ^= 1;
+
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
+      if (ShouldBeCallOrBinInst ||
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (SVInst != PrevVecV[1])
+        return false;
+
+      auto *ShuffleVec = SVInst->getOperand(0);
+      if (!ShuffleVec || ShuffleVec != PrevVecV[0])
+        return false;
+
+      SmallVector<int> CurMask;
+      SVInst->getShuffleMask(CurMask);
+
+      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
+        return false;
+      ExpectedShuffleMaskHalf *= 2;
+
+      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
+        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
+          return false;
+        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
+          return false;
+      }
+      ShuffleMaskHalf *= 2;
+      if (ExpectedShuffleMaskHalf == VecSize)
+        break;
+      ShouldBeCallOrBinInst ^= 1;
+    } else {
+      return false;
+    }
+  }
+
+  if (ShouldBeCallOrBinInst)
+    return false;
+
+  assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
+         "Expected Match for Vector Size and Mask Half");
+
+  Value *FinalVecV = PrevVecV[0];
+  auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
+
+  if (!InitEEV || !FinalVecV)
+    return false;
+
+  assert(FinalVecVTy && "Expected non-null value for Vector Type");
+
+  Intrinsic::ID ReducedOp = 0;
+  if (CommonCallOp) {
+    switch (CommonCallOp) {
+    case Intrinsic::umin:
+      ReducedOp = Intrinsic::vector_reduce_umin;
+      break;
+    case Intrinsic::umax:
+      ReducedOp = Intrinsic::vector_reduce_umax;
+      break;
+    case Intrinsic::smin:
+      ReducedOp = Intrinsic::vector_reduce_smin;
+      break;
+    case Intrinsic::smax:
+      ReducedOp = Intrinsic::vector_reduce_smax;
+      break;
+    default:
+      return false;
+    }
+  } else if (CommonBinOp) {
+    switch (CommonBinOp) {
+    case BinaryOperator::Add:
+      ReducedOp = Intrinsic::vector_reduce_add;
+      break;
+    case BinaryOperator::Mul:
+      ReducedOp = Intrinsic::vector_reduce_mul;
+      break;
+    case BinaryOperator::Or:
+      ReducedOp = Intrinsic::vector_reduce_or;
+      break;
+    case BinaryOperator::And:
+      ReducedOp = Intrinsic::vector_reduce_and;
+      break;
+    case BinaryOperator::Xor:
+      ReducedOp = Intrinsic::vector_reduce_xor;
+      break;
+    default:
+      return false;
+    }
+  }
+
+  InstructionCost OrigCost = 0;
+  unsigned int NumLevels = Log2_64(VecSize);
+
+  for (unsigned int Level = 0; Level < NumLevels; ++Level) {
+    OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                   FinalVecVTy, FinalVecVTy);
+    OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
+  }
+  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
+                                     CostKind, 0);
+
+  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
+  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+  if (NewCost >= OrigCost)
+    return false;
+
+  auto *ReducedResult =
+      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+  replaceValue(*InitEEV, *ReducedResult);
+
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 /// reduce(trunc(x)) -> trunc(reduce(x)).
 /// reduce(sext(x)) -> sext(reduce(x)).
@@ -3705,6 +3940,9 @@ bool VectorCombine::run() {
       MadeChange |= foldShuffleFromReductions(I);
       MadeChange |= foldCastFromReductions(I);
       break;
+    case Instruction::ExtractElement:
+      MadeChange |= foldShuffleChainsToReduce(I);
+      break;
     case Instruction::ICmp:
     case Instruction::FCmp:
       MadeChange |= foldExtractExtract(I);
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 2ec48a8637dae..20eae934019eb 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -280,14 +280,12 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = fdiv <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i1> [[TMP8]], [[SHIFT]]
-; CHECK-NEXT:    [[OR_COND:%.*]] = extractelement <2 x i1> [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp uge <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i1> [[TMP8]] to i2
+; CHECK-NEXT:    [[OR_COND:%.*]] = icmp eq i2 [[TMP9]], 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], splat (double 1.000000e+00)
-; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <2 x i1> [[TMP10]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = or <2 x i1> [[TMP10]], [[SHIFT2]]
-; CHECK-NEXT:    [[OR_COND1_NOT:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i1> [[TMP10]] to i2
+; CHECK-NEXT:    [[OR_COND1_NOT:%.*]] = icmp ne i2 [[TMP11]], 0
 ; CHECK-NEXT:    [[RETVAL_0:%.*]] = select i1 [[OR_COND]], i1 false, i1 [[OR_COND1_NOT]]
 ; CHECK-NEXT:    ret i1 [[RETVAL_0]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
index 6907e12158337..7bcdba6cc8904 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp-binop.ll
@@ -13,9 +13,7 @@ define i1 @fcmp_and_v2f64(<2 x double> %a) {
 ;
 ; AVX-LABEL: @fcmp_and_v2f64(
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A:%.*]],
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:    [[R:%.*]] = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> [[TMP1]])
 ; AVX-NEXT:    ret i1 [[R]]
 ;
   %e1 = extractelement <2 x double> %a, i32 0
@@ -117,9 +115,7 @@ define i1 @fcmp_and_v2f64_multiuse(<2 x double> %a) {
 ; AVX-NEXT:    [[E1:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
 ; AVX-NEXT:    call void @use(double [[E1]])
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp olt <2 x double> [[A]],
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; AVX-NEXT:    [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[SHIFT]]
-; AVX-NEXT:    [[R:%.*]] = extractelement <2 x i1> [[TMP2]], i64 0
+; AVX-NEXT:
[[R:%.*]] = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> [[TMP1]]) ; AVX-NEXT: call void @use(i1 [[R]]) ; AVX-NEXT: ret i1 [[R]] ; diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll new file mode 100644 index 0000000000000..82b20ccc5b8f5 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; +; CHECK-LABEL: define i8 @test_reduce_v16i8( +; CHECK-SAME: <16 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP8]] +; + %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> + %2 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a0, <16 x i8> %1) + %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> + %4 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %2, <16 x i8> %3) + %5 = shufflevector <16 x i8> %4, <16 x i8> poison, <16 x i32> + %6 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %4, <16 x i8> %5) + %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <16 x i32> + %8 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %6, <16 x i8> %7) + %9 = extractelement <16 x i8> %8, i64 0 + ret i8 %9 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; CHECK-LABEL: define i8 @test_reduce_v32i8( +; CHECK-SAME: <32 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> + %2 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a0, <32 x i8> %1) + %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32> + %4 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %2, <32 x i8> %3) + %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <32 x i32> + %6 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %4, <32 x i8> %5) + %7 = shufflevector <32 x i8> %6, <32 x i8> poison, <32 x i32> + %8 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %6, <32 x i8> %7) + %9 = shufflevector <32 x i8> %8, <32 x i8> poison, <32 x i32> + %10 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %8, <32 x i8> %9) + %11 = extractelement <32 x i8> %10, i64 0 + ret i8 %11 +} + +define i16 
@test_reduce_v16i16(<16 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v16i16( +; CHECK-SAME: <16 x i16> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> + %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1) + %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> + %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3) + %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> + %6 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %4, <16 x i16> %5) + %7 = shufflevector <16 x i16> %6, <16 x i16> poison, <16 x i32> + %8 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %6, <16 x i16> %7) + %9 = extractelement <16 x i16> %8, i64 0 + ret i16 %9 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; CHECK-LABEL: define i8 @test_reduce_v64i8( +; CHECK-SAME: <64 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> + %2 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a0, <64 x i8> %1) + %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> + %4 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %2, <64 x i8> %3) + %5 = shufflevector <64 x i8> %4, <64 x i8> poison, <64 x i32> + %6 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %4, <64 x i8> %5) + %7 = shufflevector <64 x i8> %6, <64 x i8> poison, <64 x i32> + %8 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %6, <64 x i8> %7) + %9 = shufflevector <64 x i8> %8, <64 x i8> poison, <64 x i32> + %10 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %8, <64 x i8> %9) + %11 = shufflevector <64 x i8> %10, <64 x i8> poison, <64 x i32> + %12 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %10, <64 x i8> %11) + %13 = extractelement <64 x i8> %12, i64 0 + ret i8 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v32i16( +; CHECK-SAME: <32 x i16> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <32 x i16> %a0, <32 x i16> poison, <32 x i32> + %2 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a0, <32 x i16> %1) + %3 = shufflevector <32 x i16> %2, <32 x i16> poison, <32 x i32> + %4 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %2, <32 x i16> %3) + %5 = shufflevector <32 x i16> %4, <32 x i16> poison, <32 x i32> + %6 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %4, <32 x i16> %5) + %7 = shufflevector <32 x i16> %6, <32 x i16> poison, <32 x i32> + %8 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %6, <32 x i16> %7) + %9 = shufflevector <32 x i16> %8, <32 x i16> poison, <32 x i32> + %10 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %8, <32 x i16> %9) + %11 = extractelement <32 x i16> %10, i64 0 + ret i16 %11 +} diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll new file mode 100644 index 0000000000000..3cb25ba4ecce6 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=vector-combine -S | 
FileCheck %s + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_2(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_2( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = tail call i16 @llvm.umin.i16(i16 [[TMP13]], i16 [[TMP14]]) +; CHECK-NEXT: ret i16 [[TMP15]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + + %8 = shufflevector <8 x i16> %6, <8 x i16> poison, <8 x i32> + %9 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %8) + %10 = shufflevector <8 x i16> %9, <8 x i16> poison, <8 x i32> + %11 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %9, <8 x i16> %10) + %12 = shufflevector <8 x i16> %11, <8 x i16> poison, <8 x i32> + %13 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %11, <8 x i16> %12) + %14 = extractelement <8 x i16> %13, i64 0 + + %15 = tail call i16 @llvm.umin.i16(i16 %7, i16 %14) + + ret i16 %15 +} + +define i16 @test_reduce_v8i16_neg1(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg1( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], 
<8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_neg2(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg2( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg3( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i64 0 +; CHECK-NEXT: ret i16 [[TMP8]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, 
<8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %6 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %7 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %5, <8 x i16> %6) + %8 = extractelement <8 x i16> %7, i64 0 + ret i16 %8 +}
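
For reference, a minimal sketch of the chain shape the new fold matches, shown on a hypothetical <4 x i32> umin reduction (the function name and values below are illustrative and not taken from the patch's test files). Each shuffle must move the upper half of the still-live lanes into the lower half (mask element i equal to ShuffleMaskHalf + i for the first half, poison afterwards), and the chain must terminate in an extract of lane 0:

define i32 @umin_chain_v4i32(<4 x i32> %v) {
  ; level 1: bring lanes 2,3 down to lanes 0,1 and combine
  %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
  %m1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %v, <4 x i32> %s1)
  ; level 2: bring lane 1 down to lane 0 and combine
  %s2 = shufflevector <4 x i32> %m1, <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
  %m2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %m1, <4 x i32> %s2)
  ; the result of the whole chain lives in lane 0
  %r = extractelement <4 x i32> %m2, i64 0
  ret i32 %r
}

When the TTI cost comparison favors it, the entire chain is replaced by a single reduction over the original input, e.g. %r = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v).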