diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 491f0b76f4ae0..59e407cc82afc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -70,10 +70,12 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include
 
 using namespace llvm;
 using namespace PatternMatch;
@@ -99,6 +101,11 @@ static cl::opt<bool>
                    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
                             "not convert byte-compare loop(s)."));
 
+static cl::opt<bool> DisableMinMaxlocPattern(
+    "disable-loop-idiom-vectorize-minmaxloc", cl::Hidden, cl::init(false),
+    cl::desc("Proceed with Loop Idiom Vectorize Pass, but do "
+             "not convert minidx/maxidx loop(s)."));
+
 static cl::opt<unsigned>
     ByteCmpVF("loop-idiom-vectorize-bytecmp-vf", cl::Hidden,
               cl::desc("The vectorization factor for byte-compare patterns."),
@@ -149,6 +156,13 @@ class LoopIdiomVectorize {
 
   bool recognizeByteCompare();
 
+  bool recognizeMinIdxPattern();
+
+  bool transformMinIdxPattern(unsigned VF, Value *FirstIndex,
+                              Value *SecondIndex, BasicBlock *LoopPreheader,
+                              Value *BasePtr, BasicBlock *Header,
+                              BasicBlock *ExitBB, Type *LoadType);
+
   Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                             GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                             Instruction *Index, Value *Start, Value *MaxLen);
@@ -239,9 +253,727 @@ bool LoopIdiomVectorize::run(Loop *L) {
   if (recognizeFindFirstByte())
     return true;
 
+  if (recognizeMinIdxPattern())
+    return true;
+
   return false;
 }
 
+bool LoopIdiomVectorize::recognizeMinIdxPattern() {
+  BasicBlock *Header = CurLoop->getHeader(),
+             *LoopPreheader = CurLoop->getLoopPreheader();
+  Function *F = Header->getParent();
+
+  if (!TTI->supportsScalableVectors() || DisableMinMaxlocPattern) {
+    LLVM_DEBUG(dbgs() << "Does not meet pre-requisites for minidx idiom\n");
+    return false;
+  }
+
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) {
+    LLVM_DEBUG(dbgs() << "Loop does not match the required structure: it "
+                         "must have exactly 1 back edge and 1 block\n");
+    return false;
+  }
+
+  if (Header->sizeWithoutDebug() < 14) {
+    LLVM_DEBUG(dbgs() << "Header block is too small for minidx pattern\n");
+    return false;
+  }
+
+  // Ensure that the loop preheader has exactly one predecessor.
+  if (!LoopPreheader || !LoopPreheader->getSinglePredecessor()) {
+    LLVM_DEBUG(
+        dbgs() << "Loop preheader should have exactly one predecessor\n");
+    return false;
+  }
+
+  // We need the below things to be able to transform the pattern:
+  // 1. First index. For this we look at the terminator instruction of the
+  // predecessor of the loop preheader. The condition of the terminator
+  // instruction decides whether to jump to the scalar loop.
+  // 2. Second index.
+  // 3. Base pointer.
+  // For 2 and 3, we iterate backwards from the header block to find the
+  // select instruction. The select instruction should be of the form
+  // select (fcmp contract olt loadA, loadB). Further details below. Once we
+  // find the required pattern, we can extract the base pointer from the
+  // first load instruction.
+  // 4. Exit basic block. For this we look at the terminator instruction of
+  // the header block.
+
+  // Extract the first index from the preheader.
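+  // For illustration, this is the shape the matcher below expects (the value
+  // names are illustrative only, not required by the match):
+  //   %neg = sub i32 0, %second_index
+  //   %neg.sext = sext i32 %neg to i64
+  //   %first.sext = sext i32 %first_index to i64
+  //   %add = add nsw i64 %first.sext, %neg.sext
+  //   %cmp = icmp slt i64 %add, 0
+  //   br i1 %cmp, label %loop.preheader, label %return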
+  Value *ICmpSLTFirstVal = nullptr, *FirstIndex = nullptr;
+  BasicBlock *RetBB = nullptr;
+  BasicBlock *PreheaderPred = LoopPreheader->getSinglePredecessor();
+  if (!match(PreheaderPred->getTerminator(),
+             m_Br(m_SpecificICmp(ICmpInst::ICMP_SLT, m_Value(ICmpSLTFirstVal),
+                                 m_ZeroInt()),
+                  m_BasicBlock(), m_BasicBlock(RetBB)))) {
+    LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+    return false;
+  }
+
+  // The Add operand can be either of the below:
+  // 1. add(sext(sub(0 - SecondIndex)), sext(FirstIndex))
+  // 2. add(sext(FirstIndex), sext(sub(0 - SecondIndex)))
+  // This depends on whether canonicalization has been done or not.
+  // TODO: Handle the case where there is no sign extension and the return
+  // type is i64.
+  if (match(ICmpSLTFirstVal, m_Add(m_SExt(m_Sub(m_ZeroInt(), m_Value())),
+                                   (m_SExt(m_Value()))))) {
+    FirstIndex = cast<Instruction>(ICmpSLTFirstVal)->getOperand(1);
+  } else if (match(ICmpSLTFirstVal,
+                   m_Add(m_SExt(m_Value()),
+                         m_SExt(m_Sub(m_ZeroInt(), m_Value()))))) {
+    FirstIndex = cast<Instruction>(ICmpSLTFirstVal)->getOperand(0);
+  } else {
+    LLVM_DEBUG(dbgs() << "Cannot extract FirstIndex from ICmpSLTFirstVal\n");
+    return false;
+  }
+
+  BasicBlock::reverse_iterator RI = Header->rbegin();
+  SelectInst *SelectToInspect = nullptr;
+  Value *BasePtr = nullptr;
+  Instruction *Trunc = nullptr;
+
+  // Iterate in the backward direction to extract the select instruction
+  // which matches the pattern:
+  //   %load1_gep = getelementptr float, ptr %invariant.gep, i64 %indvars.iv
+  //   %load1 = load float, ptr %load1_gep, align 4
+  //   %load2_gep = getelementptr float, ptr ..., ...
+  //   %load2 = load float, ptr %load2_gep, align 4
+  //   %trunc = trunc nsw i64 %indvars.iv.next to i32
+  //   %fcmp = fcmp contract olt float %load1, %load2
+  //   %select = select i1 %fcmp, i32 %trunc, i32 %index
+  //   %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  while (RI != Header->rend()) {
+    if (auto *Sel = dyn_cast<SelectInst>(&*RI)) {
+      if (match(Sel, m_Select(m_SpecificFCmp(
+                                  FCmpInst::FCMP_OLT,
+                                  m_Load(m_GEP(m_Value(BasePtr), m_Value())),
+                                  m_Load(m_GEP(m_Value(), m_Value()))),
+                              m_Instruction(Trunc), m_Value()))) {
+        SelectToInspect = Sel;
+      }
+    }
+    ++RI;
+  }
+  if (!SelectToInspect || !BasePtr) {
+    LLVM_DEBUG(dbgs() << "Select or BasePtr not found\n");
+    return false;
+  }
+
+  // Extract the FCmp and validate the load types.
+  auto *FCmp = dyn_cast<FCmpInst>(SelectToInspect->getCondition());
+  if (!FCmp || !isa<LoadInst>(FCmp->getOperand(0)) ||
+      !isa<LoadInst>(FCmp->getOperand(1)))
+    return false;
+
+  auto *LoadA = cast<LoadInst>(FCmp->getOperand(0));
+  auto *LoadB = cast<LoadInst>(FCmp->getOperand(1));
+
+  if (LoadA->getType() != LoadB->getType()) {
+    LLVM_DEBUG(dbgs() << "Load types don't match\n");
+    return false;
+  }
+
+  // Validate that the truncation instruction matches the expected pattern.
+  TruncInst *TInst = dyn_cast<TruncInst>(Trunc);
+  if (!TInst || TInst->getDestTy() != F->getReturnType()) {
+    LLVM_DEBUG(dbgs() << "Trunc instruction validation failed\n");
+    return false;
+  }
+  // The trunc instruction's operand should be of the form (add IVPHI, -1).
+  Instruction *IVInst = nullptr;
+  if (!match(TInst->getOperand(0),
+             m_Add(m_Instruction(IVInst), m_SpecificInt(-1)))) {
+    LLVM_DEBUG(
+        dbgs() << "Trunc instruction operand doesn't match expected pattern\n");
+    return false;
+  }
+
+  PHINode *IVPhi = dyn_cast<PHINode>(IVInst);
+  if (!IVPhi) {
+    LLVM_DEBUG(dbgs() << "Add operand of trunc instruction is not a PHINode\n");
+    return false;
+  }
+
+  Value *SecondIndex = IVPhi->getIncomingValueForBlock(LoopPreheader);
+  LLVM_DEBUG(dbgs() << "SecondIndex is " << *SecondIndex << "\n");
+
+  // 4. Inspect the terminator to extract the exit block.
+  // Example LLVM IR to inspect:
+  //   %20 = icmp sgt i64 %13, 1
+  //   br i1 %20, label %.lr.ph, label %._crit_edge.loopexit
+  Value *ICmpFirstVal = nullptr;
+  BasicBlock *FalseBB = nullptr;
+  BranchInst *Terminator = dyn_cast<BranchInst>(Header->getTerminator());
+  if (!match(Terminator, m_Br(m_SpecificICmp(ICmpInst::ICMP_SGT,
+                                             m_Value(ICmpFirstVal), m_One()),
+                              m_BasicBlock(Header), m_BasicBlock(FalseBB)))) {
+    LLVM_DEBUG(dbgs() << "Terminator doesn't match expected pattern\n");
+    return false;
+  }
+
+  // TODO: Handle other vector widths.
+  unsigned VF = 128 / LoadA->getType()->getPrimitiveSizeInBits();
+
+  // We've recognized the pattern, now transform it.
+  LLVM_DEBUG(dbgs() << "FOUND MINIDX PATTERN\n");
+
+  return transformMinIdxPattern(VF, FirstIndex, SecondIndex, LoopPreheader,
+                                BasePtr, Header, FalseBB, LoadA->getType());
+}
+
+bool LoopIdiomVectorize::transformMinIdxPattern(
+    unsigned VF, Value *FirstIndex, Value *SecondIndex,
+    BasicBlock *LoopPreheader, Value *BasePtr, BasicBlock *Header,
+    BasicBlock *ExitBB, Type *LoadType) {
+
+  LLVMContext &Ctx = Header->getContext();
+  Function *F = Header->getParent();
+  Module *M = F->getParent();
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  Type *I32Ty = Type::getInt32Ty(Ctx);
+  Type *I64Ty = Type::getInt64Ty(Ctx);
+  Type *I1Ty = Type::getInt1Ty(Ctx);
+  Type *PtrTy = PointerType::get(Ctx, 0);
+  auto *MaskTy = ScalableVectorType::get(Type::getInt1Ty(Ctx), 4);
+  auto *VecTy = ScalableVectorType::get(
+      LoadType, VF); // Vector type of the loaded elements, e.g.
+                     // <vscale x 4 x float>.
+
+  // High-level overview of the transformation:
+  // We divide the work into three phases:
+  // In the first phase, we process a leading chunk whose size is not a
+  // multiple of VF. We do this by rounding down `SecondIndex` to the nearest
+  // multiple of VF. The minimum value and the index of the minimum value are
+  // computed for this chunk.
+  // In the second phase, we process the chunks whose sizes are multiples of
+  // VF.
+  // In the third phase, we process the trailing chunk whose size is not a
+  // multiple of VF. The third phase is required because FirstIndex is not
+  // necessarily aligned to a multiple of VF; the first phase starts at
+  // max(round_down(SecondIndex, VF), FirstIndex), so a final partial chunk
+  // down to FirstIndex may remain.
+
+  // Overview of the algorithm to compute the min index within a chunk:
+  // 1. We compare the (reversed) loaded vector against a splat of the
+  //    chunk's minimum value.
+  // 2. We then set all mask bits up to the first set bit in the result of
+  //    the above comparison.
+  // 3. Next, we count the number of set bits, which gives us the offset of
+  //    the minimum from the base of the chunk. The base of the chunk is
+  //    updated in each phase.
+  // Steps 2 and 3 are realized via the `cttz.elts` intrinsic, which maps to
+  // brkb + cnt on SVE.
+
+  // The below basic blocks are used to process the first phase and are for
+  // processing the chunk which is not a multiple of VF.
+  BasicBlock *VecEntry = BasicBlock::Create(Ctx, "minidx.vec.entry", F);
+  BasicBlock *MinIdxPartial1If =
+      BasicBlock::Create(Ctx, "minidx.partial.1.if", F);
+  BasicBlock *MinIdxPartial1ProcExit =
+      BasicBlock::Create(Ctx, "minidx.partial.1.proc.exit", F);
+
+  // The below basic blocks are used to process the second phase and are for
+  // processing the chunks which are multiples of VF.
+  BasicBlock *MinIdxWhileBodyLrPh =
+      BasicBlock::Create(Ctx, "minidx.while.body.ph", F);
+  BasicBlock *MinIdxVectBody = BasicBlock::Create(Ctx, "minidx.vect.body", F);
+  BasicBlock *MinIdxVectUpdate =
+      BasicBlock::Create(Ctx, "minidx.vect.update", F);
+  BasicBlock *MinIdxVectContinue =
+      BasicBlock::Create(Ctx, "minidx.vect.continue", F);
+  BasicBlock *MinIdxVectEnd = BasicBlock::Create(Ctx, "minidx.vect.end", F);
+
+  // The below basic blocks are used to process the third phase and are for
+  // processing the last chunk which is not a multiple of VF.
+  BasicBlock *MinIdxPartial2If =
+      BasicBlock::Create(Ctx, "minidx.partial.2.if", F);
+  BasicBlock *MinIdxPartial2Exit =
+      BasicBlock::Create(Ctx, "minidx.partial.2.exit", F);
+  BasicBlock *MinIdxEnd = BasicBlock::Create(Ctx, "minidx.end", F);
+
+  Loop *VecLoop = LI->AllocateLoop();
+  VecLoop->addBasicBlockToLoop(MinIdxVectBody, *LI);
+  VecLoop->addBasicBlockToLoop(MinIdxVectUpdate, *LI);
+  VecLoop->addBasicBlockToLoop(MinIdxVectContinue, *LI);
+
+  LI->addTopLevelLoop(VecLoop);
+
+  // Start populating the preheader.
+  IRBuilder<> Builder(LoopPreheader->getTerminator());
+  // %VScale = tail call i64 @llvm.vscale.i64()
+  // %VLen = shl i64 %VScale, 2
+  // %minidx.not = sub nsw i64 0, %VLen
+  // %minidx.and = and i64 %SecondIndex, %minidx.not
+  Value *GMax = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                          ConstantFP::getInfinity(LoadType, 0),
+                                          "minidx.gmax");
+  Value *VScale = Builder.CreateVScale(I64Ty);
+  Value *VLen =
+      Builder.CreateShl(VScale, ConstantInt::get(I64Ty, 2), "minidx.vlen");
+  Value *Not =
+      Builder.CreateSub(ConstantInt::get(I64Ty, 0), VLen, "minidx.not");
+  Value *And = Builder.CreateAnd(SecondIndex, Not, "minidx.and");
+
+  // %minidx.umax = tail call i64 @llvm.smax.i64(i64 %minidx.and, i64 %FirstIndex)
+  // %minidx.add = add i64 %SecondIndex, 1
+  Value *Umax = Builder.CreateIntrinsic(
+      Intrinsic::smax, {I64Ty}, {And, FirstIndex}, nullptr, "minidx.umax");
+  Value *Add =
+      Builder.CreateAdd(SecondIndex, ConstantInt::get(I64Ty, 1), "minidx.add");
+  // %minidx.mask = call <vscale x 4 x i1>
+  //     @llvm.get.active.lane.mask.nxv4i1.i64(i64 %minidx.umax, i64 %minidx.add)
+  Value *MinIdxMask = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::get_active_lane_mask,
+                                        {MaskTy, I64Ty}),
+      {Umax, Add}, "minidx.mask");
+
+  // %minidx.umax.minus1 = sub i64 %minidx.umax, 1
+  // %minidx.add.ptr.i = getelementptr inbounds float, ptr %p, i64 %minidx.umax.minus1
+  // %minidx.loadVals = tail call <vscale x 4 x float>
+  //     @llvm.masked.load.nxv4f32.p0(ptr %minidx.add.ptr.i, i32 1,
+  //     <vscale x 4 x i1> %minidx.mask, <vscale x 4 x float> zeroinitializer)
+  // %minidx.currentVals = select <vscale x 4 x i1> %minidx.mask,
+  //     <vscale x 4 x float> %minidx.loadVals,
+  //     <vscale x 4 x float> splat (float 0x7FF0000000000000)
+  // %minidx.reverse = tail call <vscale x 4 x i1>
+  //     @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> %minidx.mask)
+  // %minidx.reverseVals = tail call <vscale x 4 x float>
+  //     @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %minidx.currentVals)
+  // %minidx.minVal = call float @llvm.vector.reduce.fminimum.nxv4f32(
+  //     <vscale x 4 x float> %minidx.reverseVals)
+  Value *UmaxMinus1 =
+      Builder.CreateSub(Umax, ConstantInt::get(I64Ty, 1), "minidx.umax.minus1");
+  Value *AddPtrI = Builder.CreateInBoundsGEP(LoadType, BasePtr, UmaxMinus1,
+                                             "minidx.add.ptr.i");
+
+  Value *LoadVals =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::masked_load, {VecTy, PtrTy}),
+                         {AddPtrI, ConstantInt::get(I32Ty, 1), MinIdxMask,
+                          Constant::getNullValue(VecTy)},
+                         "minidx.loadVals");
+  Value *CurrentVals =
+      Builder.CreateSelect(MinIdxMask, LoadVals, GMax, "minidx.currentVals");
+  Value *Reverse = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse,
+                                        {MaskTy}),
+      {MinIdxMask}, "minidx.reverse");
+  Value *ReverseVals = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+      {CurrentVals}, "minidx.reverseVals");
+  Value *MinVal =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::vector_reduce_fminimum, {VecTy}),
+                         {ReverseVals}, "minidx.minVal");
+
+  Builder.CreateCondBr(Builder.getTrue(), VecEntry, Header);
+  LoopPreheader->getTerminator()->eraseFromParent();
+
+  // Add an edge from the preheader to VecEntry.
+  DTU.applyUpdates({{DominatorTree::Insert, LoopPreheader, VecEntry}});
+
+  // %minidx.entry.cmp = fcmp olt float %minidx.minVal, 0x7FF0000000000000
+  // br i1 %minidx.entry.cmp, label %minidx.partial.1.if,
+  //     label %minidx.partial.1.proc.exit
+  Builder.SetInsertPoint(VecEntry);
+  Value *VecEntryCmp = Builder.CreateFCmpOLT(
+      MinVal, ConstantFP::getInfinity(LoadType, 0), "minidx.entry.cmp");
+  Builder.CreateCondBr(VecEntryCmp, MinIdxPartial1If, MinIdxPartial1ProcExit);
+
+  // Connect edges from VecEntry to MinIdxPartial1If and MinIdxPartial1ProcExit.
+  DTU.applyUpdates({{DominatorTree::Insert, VecEntry, MinIdxPartial1If},
+                    {DominatorTree::Insert, VecEntry, MinIdxPartial1ProcExit}});
+
+  Builder.SetInsertPoint(MinIdxPartial1If);
+  // %minval.splatinsert = insertelement <vscale x 4 x float> poison,
+  //     float %minidx.minVal, i64 0
+  // %minval.splat = shufflevector <vscale x 4 x float> %minval.splatinsert,
+  //     <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+  Value *MinValSplat = Builder.CreateVectorSplat(ElementCount::getScalable(VF),
+                                                 MinVal, "minval.splat");
+  // %minidx.partial.1.cmp = fcmp oeq <vscale x 4 x float> %minidx.reverseVals,
+  //     %minval.splat
+  // %minidx.partial.1.and = and <vscale x 4 x i1> %minidx.reverse,
+  //     %minidx.partial.1.cmp
+  // %minidx.partial.1.cttz = tail call i64
+  //     @llvm.experimental.cttz.elts.i64.nxv4i1(
+  //     <vscale x 4 x i1> %minidx.partial.1.and, i1 true)
+  Value *FirstPartialCmp =
+      Builder.CreateFCmpOEQ(ReverseVals, MinValSplat, "minidx.partial.1.cmp");
+  Value *FirstPartialAnd =
+      Builder.CreateAnd(Reverse, FirstPartialCmp, "minidx.partial.1.and");
+  Value *FirstPartialCTTZ = Builder.CreateCountTrailingZeroElems(
+      I64Ty, FirstPartialAnd, ConstantInt::get(I1Ty, 1),
+      "minidx.partial.1.cttz");
+
+  // %minidx.partial.1.tmp = sub i64 %VLen, %minidx.partial.1.cttz
+  // %minidx.partial.1.tmp.minus1 = sub i64 %minidx.partial.1.tmp, 1
+  // %minidx.partial.1.add2 = add i64 %minidx.umax, %minidx.partial.1.tmp.minus1
+  Value *FirstPartialTmp1 =
+      Builder.CreateSub(VLen, FirstPartialCTTZ, "minidx.partial.1.tmp");
+  Value *FirstPartialTmp =
+      Builder.CreateSub(FirstPartialTmp1, ConstantInt::get(I64Ty, 1),
+                        "minidx.partial.1.tmp.minus1");
+  Value *FirstPartialAdd2 =
+      Builder.CreateAdd(Umax, FirstPartialTmp, "minidx.partial.1.add2");
+
+  Builder.CreateBr(MinIdxPartial1ProcExit);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial1If, MinIdxPartial1ProcExit}});
+
+  Builder.SetInsertPoint(MinIdxPartial1ProcExit);
+  // %minidx.partial.1.exit.known_min = phi float
+  //     [ %minidx.minVal, %minidx.partial.1.if ],
+  //     [ 0x7FF0000000000000, %minidx.vec.entry ]
+  // %partial1.exit.known_arg = phi i64
+  //     [ %minidx.partial.1.add2, %minidx.partial.1.if ],
+  //     [ 0, %minidx.vec.entry ]
+  PHINode *Partial1ExitKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.partial.1.exit.known_min");
+  PHINode *Partial1ExitKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "partial1.exit.known_arg");
+
+  Partial1ExitKnownMin->addIncoming(MinVal, MinIdxPartial1If);
+  Partial1ExitKnownMin->addIncoming(ConstantFP::getInfinity(LoadType, 0),
+                                    VecEntry);
+  Partial1ExitKnownArg->addIncoming(FirstPartialAdd2, MinIdxPartial1If);
+  Partial1ExitKnownArg->addIncoming(ConstantInt::get(I64Ty, 0), VecEntry);
+
+  // %minidx.partial.1.proc.exit.add = add i64 %VLen, %FirstIndex
+  // %minidx.partial.1.proc.exit.icmp = icmp ult i64 %minidx.umax,
+  //     %minidx.partial.1.proc.exit.add
+  // br i1 %minidx.partial.1.proc.exit.icmp, label %minidx.vect.end,
+  //     label %minidx.while.body.ph
+  Value *MinIdxPartial1ProcExitAdd =
+      Builder.CreateAdd(VLen, FirstIndex, "minidx.partial.1.proc.exit.add");
+  Value *MinIdxPartial1ProcExitICmp = Builder.CreateICmpULT(
+      Umax, MinIdxPartial1ProcExitAdd, "minidx.partial.1.proc.exit.icmp");
+  Builder.CreateCondBr(MinIdxPartial1ProcExitICmp, MinIdxVectEnd,
+                       MinIdxWhileBodyLrPh);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxVectEnd},
+       {DominatorTree::Insert, MinIdxPartial1ProcExit, MinIdxWhileBodyLrPh}});
+
+  Builder.SetInsertPoint(MinIdxWhileBodyLrPh);
+  // br label %minidx.vect.body
+  Builder.CreateBr(MinIdxVectBody);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxWhileBodyLrPh, MinIdxVectBody}});
+
+  Builder.SetInsertPoint(MinIdxVectBody);
+  // %minidx.vect.body.phi1 = phi i64 [ %minidx.umax, %minidx.while.body.ph ],
+  //     [ %minidx.vect.body.sub, %minidx.vect.continue ]
+  // %minidx.vect.body.known_arg = phi i64
+  //     [ %partial1.exit.known_arg, %minidx.while.body.ph ],
+  //     [ %minidx.vect.continue.known_arg, %minidx.vect.continue ]
+  // %minidx.vect.body.known_min = phi float
+  //     [ %minidx.partial.1.exit.known_min, %minidx.while.body.ph ],
+  //     [ %minidx.vect.continue.known_min, %minidx.vect.continue ]
+  PHINode *MinIdxVectBodyPhi1 =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.phi1");
+  PHINode *MinIdxVectBodyKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.body.known_arg");
+  PHINode *MinIdxVectBodyKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.vect.body.known_min");
+
+  // %minidx.vect.body.sub = sub i64 %minidx.vect.body.phi1, %VLen
+  // %minidx.vect.body.sub.minus1 = sub i64 %minidx.vect.body.sub, 1
+  // %minidx.vect.body.gep = getelementptr inbounds float, ptr %p,
+  //     i64 %minidx.vect.body.sub.minus1
+  Value *MinIdxVectBodySub =
+      Builder.CreateSub(MinIdxVectBodyPhi1, VLen, "minidx.vect.body.sub");
+  Value *MinIdxVectBodySubMinus1 =
+      Builder.CreateSub(MinIdxVectBodySub, ConstantInt::get(I64Ty, 1),
+                        "minidx.vect.body.sub.minus1");
+  Value *MinIdxVectBodyGEP = Builder.CreateInBoundsGEP(
+      LoadType, BasePtr, MinIdxVectBodySubMinus1, "minidx.vect.body.gep");
+
+  // %minidx.vect.body.unmaskedload = load <vscale x 4 x float>,
+  //     ptr %minidx.vect.body.gep, align 16
+  // %minidx.vect.body.reverse = tail call <vscale x 4 x float>
+  //     @llvm.vector.reverse.nxv4f32(
+  //     <vscale x 4 x float> %minidx.vect.body.unmaskedload)
+  // %minidx.vect.body.reduce = tail call float
+  //     @llvm.vector.reduce.fminimum.nxv4f32(
+  //     <vscale x 4 x float> %minidx.vect.body.reverse)
+  Value *MinIdxVectBodyUnmaskedLoad = Builder.CreateLoad(
+      VecTy, MinIdxVectBodyGEP, "minidx.vect.body.unmaskedload");
+  Value *MinIdxVectBodyReverse = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+      {MinIdxVectBodyUnmaskedLoad}, "minidx.vect.body.reverse");
+  Value *MinIdxVectBodyReduce =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::vector_reduce_fminimum, {VecTy}),
+                         {MinIdxVectBodyReverse}, "minidx.vect.body.reduce");
+
+  // %minidx.vect.body.fcmp = fcmp olt float %minidx.vect.body.reduce,
+  //     %minidx.vect.body.known_min
+  // br i1 %minidx.vect.body.fcmp, label %minidx.vect.update,
+  //     label %minidx.vect.continue
+  Value *MinIdxVectBodyFCmp = Builder.CreateFCmpOLT(
+      MinIdxVectBodyReduce, MinIdxVectBodyKnownMin, "minidx.vect.body.fcmp");
+
+  Builder.CreateCondBr(MinIdxVectBodyFCmp, MinIdxVectUpdate,
+                       MinIdxVectContinue);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxVectBody, MinIdxVectUpdate},
+       {DominatorTree::Insert, MinIdxVectBody, MinIdxVectContinue}});
+
+  Builder.SetInsertPoint(MinIdxVectUpdate);
+  // %minidx.vect.update.splatinsert = insertelement <vscale x 4 x float>
+  //     poison, float %minidx.vect.body.reduce, i64 0
+  // %minidx.vect.update.splat = shufflevector <vscale x 4 x float>
+  //     %minidx.vect.update.splatinsert, <vscale x 4 x float> poison,
+  //     <vscale x 4 x i32> zeroinitializer
+  // %minidx.vect.update.fcmp = fcmp ueq <vscale x 4 x float>
+  //     %minidx.vect.body.reverse, %minidx.vect.update.splat
+  Value *MinIdxVectUpdateSplat = Builder.CreateVectorSplat(
+      ElementCount::getScalable(VF), MinIdxVectBodyReduce,
+      "minidx.vect.update.splatinsert");
+  Value *MinIdxVectUpdateFCmp = Builder.CreateFCmpUEQ(
+      MinIdxVectBodyReverse, MinIdxVectUpdateSplat, "minidx.vect.update.fcmp");
+
+  // %minidx.vect.update.cttz = call i64
+  //     @llvm.experimental.cttz.elts.i64.nxv4i1(
+  //     <vscale x 4 x i1> %minidx.vect.update.fcmp, i1 true)
+  // %minidx.vect.update.mul = mul i64 %minidx.vect.update.cttz, -1
+  // %minidx.vect.update.add = add i64 %minidx.vect.body.phi1,
+  //     %minidx.vect.update.mul
+  Value *MinIdxVectUpdateCTTZ = Builder.CreateCountTrailingZeroElems(
+      I64Ty, MinIdxVectUpdateFCmp, ConstantInt::get(I1Ty, 1),
+      "minidx.vect.update.cttz");
+  Value *MinIdxVectUpdateMul =
+      Builder.CreateMul(MinIdxVectUpdateCTTZ, ConstantInt::get(I64Ty, -1),
+                        "minidx.vect.update.mul");
+  Value *MinIdxVectUpdateAdd = Builder.CreateAdd(
+      MinIdxVectBodyPhi1, MinIdxVectUpdateMul, "minidx.vect.update.add");
+
+  // %minidx.vect.body.add2 = add i64 %minidx.vect.update.add, -1
+  // br label %minidx.vect.continue
+  Value *MinIdxVectBodyAdd2 =
+      Builder.CreateAdd(MinIdxVectUpdateAdd, ConstantInt::get(I64Ty, -1),
+                        "minidx.vect.body.add2");
+  Builder.CreateBr(MinIdxVectContinue);
+
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxVectUpdate, MinIdxVectContinue}});
+
+  Builder.SetInsertPoint(MinIdxVectContinue);
+  // %minidx.vect.continue.known_min = phi float
+  //     [ %minidx.vect.body.reduce, %minidx.vect.update ],
+  //     [ %minidx.vect.body.known_min, %minidx.vect.body ]
+  // %minidx.vect.continue.known_arg = phi i64
+  //     [ %minidx.vect.body.add2, %minidx.vect.update ],
+  //     [ %minidx.vect.body.known_arg, %minidx.vect.body ]
+  // %minidx.vect.continue.icmp = icmp ult i64 %minidx.vect.body.sub,
+  //     %minidx.partial.1.proc.exit.add
+  PHINode *MinIdxVectContinueKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.vect.continue.known_min");
+  PHINode *MinIdxVectContinueKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.continue.known_arg");
+
+  MinIdxVectContinueKnownMin->addIncoming(MinIdxVectBodyReduce,
+                                          MinIdxVectUpdate);
+  MinIdxVectContinueKnownMin->addIncoming(MinIdxVectBodyKnownMin,
+                                          MinIdxVectBody);
+  MinIdxVectContinueKnownArg->addIncoming(MinIdxVectBodyAdd2, MinIdxVectUpdate);
+  MinIdxVectContinueKnownArg->addIncoming(MinIdxVectBodyKnownArg,
+                                          MinIdxVectBody);
+
+  // br i1 %minidx.vect.continue.icmp, label %minidx.vect.end,
+  //     label %minidx.vect.body
+  Value *MinIdxVectContinueICmp =
+      Builder.CreateICmpULT(MinIdxVectBodySub, MinIdxPartial1ProcExitAdd,
+                            "minidx.vect.continue.icmp");
+
+  Builder.CreateCondBr(MinIdxVectContinueICmp, MinIdxVectEnd, MinIdxVectBody);
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxVectContinue, MinIdxVectEnd},
+       {DominatorTree::Insert, MinIdxVectContinue, MinIdxVectBody}});
+
+  Builder.SetInsertPoint(MinIdxVectEnd);
+  // %minidx.vect.end.known_min.lcssa = phi float
+  //     [ %minidx.partial.1.exit.known_min, %minidx.partial.1.proc.exit ],
+  //     [ %minidx.vect.continue.known_min, %minidx.vect.continue ]
+  // %minidx.vect.end.known_arg.lcssa = phi i64
+  //     [ %partial1.exit.known_arg, %minidx.partial.1.proc.exit ],
+  //     [ %minidx.vect.continue.known_arg, %minidx.vect.continue ]
+  // %minidx.vect.end.lcssa = phi i64
+  //     [ %minidx.umax, %minidx.partial.1.proc.exit ],
+  //     [ %minidx.vect.body.sub, %minidx.vect.continue ]
+  PHINode *MinIdxVectEndKnownMin =
+      Builder.CreatePHI(LoadType, 2, "minidx.vect.end.known_min.lcssa");
+  PHINode *MinIdxVectEndKnownArg =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.end.known_arg.lcssa");
+  PHINode *MinIdxVectEndLCSSA =
+      Builder.CreatePHI(I64Ty, 2, "minidx.vect.end.lcssa");
+
+  MinIdxVectEndKnownMin->addIncoming(Partial1ExitKnownMin,
+                                     MinIdxPartial1ProcExit);
+  MinIdxVectEndKnownMin->addIncoming(MinIdxVectContinueKnownMin,
+                                     MinIdxVectContinue);
+  MinIdxVectEndKnownArg->addIncoming(Partial1ExitKnownArg,
+                                     MinIdxPartial1ProcExit);
+  MinIdxVectEndKnownArg->addIncoming(MinIdxVectContinueKnownArg,
+                                     MinIdxVectContinue);
+  MinIdxVectEndLCSSA->addIncoming(Umax, MinIdxPartial1ProcExit);
+  MinIdxVectEndLCSSA->addIncoming(MinIdxVectBodySub, MinIdxVectContinue);
+
+  // %minidx.vect.end.cmp = icmp ugt i64 %minidx.vect.end.lcssa, %FirstIndex
+  // br i1 %minidx.vect.end.cmp, label %minidx.partial.2.if, label %minidx.end
+  Value *MinIdxVectEndCmp = Builder.CreateICmpUGT(
+      MinIdxVectEndLCSSA, FirstIndex, "minidx.vect.end.cmp");
+  Builder.CreateCondBr(MinIdxVectEndCmp, MinIdxPartial2If, MinIdxEnd);
+  DTU.applyUpdates({{DominatorTree::Insert, MinIdxVectEnd, MinIdxPartial2If},
+                    {DominatorTree::Insert, MinIdxVectEnd, MinIdxEnd}});
+
+  Builder.SetInsertPoint(MinIdxPartial2If);
+  // %minidx.partial.2.if.add.zero = add i64 %minidx.vect.end.lcssa, 0
+  // %minidx.partial.2.if.mask = call <vscale x 4 x i1>
+  //     @llvm.get.active.lane.mask.nxv4i1.i64(i64 %FirstIndex,
+  //     i64 %minidx.partial.2.if.add.zero)
+  // %minidx.partial.2.if.gep = getelementptr inbounds float, ptr %p,
+  //     i64 %minidx.partial.2.if.firstindex.minus1
+  Value *MinIdxPartial2IfAdd =
+      Builder.CreateAdd(MinIdxVectEndLCSSA, ConstantInt::get(I64Ty, 0),
+                        "minidx.partial.2.if.add.zero");
+  Value *MinIdxPartial2IfMask = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::get_active_lane_mask,
+                                        {MaskTy, I64Ty}),
+      {FirstIndex, MinIdxPartial2IfAdd}, "minidx.partial.2.if.mask");
+
+  // Reverse the mask.
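+  // Note: the whole sequence works on reversed vectors (lane 0 corresponds
+  // to the highest index), so that cttz.elts picks the matching lane closest
+  // to the high end of the chunk, mirroring the scalar loop's backward scan.
+  // The lane mask is reversed here so it lines up with the reversed loaded
+  // values.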
+  MinIdxPartial2IfMask = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse,
+                                        {MaskTy}),
+      {MinIdxPartial2IfMask}, "minidx.partial.2.if.mask.reverse");
+
+  Value *FirstIndexMinus1 =
+      Builder.CreateSub(FirstIndex, ConstantInt::get(I64Ty, 1),
+                        "minidx.partial.2.if.firstindex.minus1");
+  Value *MinIdxPartial2IfGEP = Builder.CreateInBoundsGEP(
+      LoadType, BasePtr, FirstIndexMinus1, "minidx.partial.2.if.gep");
+
+  // %minidx.partial.2.if.load = tail call <vscale x 4 x float>
+  //     @llvm.masked.load.nxv4f32.p0(ptr %minidx.partial.2.if.gep, i32 1,
+  //     <vscale x 4 x i1> %minidx.partial.2.if.mask.reverse,
+  //     <vscale x 4 x float> zeroinitializer)
+  // %minidx.partial.2.if.reverse = tail call <vscale x 4 x float>
+  //     @llvm.vector.reverse.nxv4f32(
+  //     <vscale x 4 x float> %minidx.partial.2.if.load)
+  // %minidx.partial.2.if.reduce = tail call float
+  //     @llvm.vector.reduce.fminimum.nxv4f32(
+  //     <vscale x 4 x float> %minidx.partial.2.if.reverse)
+  Value *MinIdxPartial2IfLoad =
+      Builder.CreateCall(Intrinsic::getOrInsertDeclaration(
+                             M, Intrinsic::masked_load, {VecTy, PtrTy}),
+                         {MinIdxPartial2IfGEP, ConstantInt::get(I32Ty, 1),
+                          MinIdxPartial2IfMask, Constant::getNullValue(VecTy)},
+                         "minidx.partial.2.if.load");
+  Value *MinIdxPartial2IfReverse = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, {VecTy}),
+      {MinIdxPartial2IfLoad}, "minidx.partial.2.if.reverse");
+  Value *MinIdxPartial2IfReduce = Builder.CreateCall(
+      Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reduce_fminimum,
+                                        {VecTy}),
+      {MinIdxPartial2IfReverse}, "minidx.partial.2.if.reduce");
+
+  // %minidx.partial.2.if.fcmp = fcmp olt float %minidx.partial.2.if.reduce,
+  //     %minidx.vect.end.known_min.lcssa
+  // br i1 %minidx.partial.2.if.fcmp, label %minidx.partial.2.exit,
+  //     label %minidx.end
+  Value *MinIdxPartial2IfFCmp =
+      Builder.CreateFCmpOLT(MinIdxPartial2IfReduce, MinIdxVectEndKnownMin,
+                            "minidx.partial.2.if.fcmp");
+  Builder.CreateCondBr(MinIdxPartial2IfFCmp, MinIdxPartial2Exit, MinIdxEnd);
+  DTU.applyUpdates(
+      {{DominatorTree::Insert, MinIdxPartial2If, MinIdxPartial2Exit},
+       {DominatorTree::Insert, MinIdxPartial2If, MinIdxEnd}});
+
+  Builder.SetInsertPoint(MinIdxPartial2Exit);
+  // %minidx.partial.2.exit.splatinsert = insertelement <vscale x 4 x float>
+  //     poison, float %minidx.partial.2.if.reduce, i64 0
+  // %minidx.partial.2.exit.splat = shufflevector <vscale x 4 x float>
+  //     %minidx.partial.2.exit.splatinsert, <vscale x 4 x float> poison,
+  //     <vscale x 4 x i32> zeroinitializer
+  // %minidx.partial.2.exit.fcmp = fcmp oeq <vscale x 4 x float>
+  //     %minidx.partial.2.if.reverse, %minidx.partial.2.exit.splat
+  Value *MinIdxPartial2ExitSplat = Builder.CreateVectorSplat(
+      ElementCount::getScalable(VF), MinIdxPartial2IfReduce,
+      "minidx.partial.2.exit.splatinsert");
+  Value *MinIdxPartial2ExitFCmp =
+      Builder.CreateFCmpOEQ(MinIdxPartial2IfReverse, MinIdxPartial2ExitSplat,
+                            "minidx.partial.2.exit.fcmp");
+
+  // %minidx.partial.2.exit.and = and <vscale x 4 x i1>
+  //     %minidx.partial.2.exit.fcmp, %minidx.partial.2.if.mask.reverse
+  // %minidx.partial.2.exit.cttz = call i64
+  //     @llvm.experimental.cttz.elts.i64.nxv4i1(
+  //     <vscale x 4 x i1> %minidx.partial.2.exit.and, i1 true)
+  Value *MinIdxPartial2ExitAnd =
+      Builder.CreateAnd(MinIdxPartial2ExitFCmp, MinIdxPartial2IfMask,
+                        "minidx.partial.2.exit.and");
+  Value *MinIdxPartial2ExitCTTZ = Builder.CreateCountTrailingZeroElems(
+      I64Ty, MinIdxPartial2ExitAnd, ConstantInt::get(I1Ty, 1),
+      "minidx.partial.2.exit.cttz");
+
+  Value *MinIdxPartial2ExitTmp1 = Builder.CreateSub(
+      VLen, MinIdxPartial2ExitCTTZ, "minidx.partial.2.exit.tmp");
+  Value *MinIdxPartial2ExitTmp =
+      Builder.CreateSub(MinIdxPartial2ExitTmp1, ConstantInt::get(I64Ty, 1),
+                        "minidx.partial.2.exit.tmp.minus1");
+  Value *MinIdxPartial2ExitAdd = Builder.CreateAdd(
+      FirstIndex, MinIdxPartial2ExitTmp, "minidx.partial.2.exit.add2");
+
+  // br label %minidx.end
+  Builder.CreateBr(MinIdxEnd);
+
+  DTU.applyUpdates({{DominatorTree::Insert, MinIdxPartial2Exit, MinIdxEnd}});
+
+  Builder.SetInsertPoint(MinIdxEnd);
+  // %minidx.ret = phi i64
+  //     [ %minidx.vect.end.known_arg.lcssa, %minidx.vect.end ],
+  //     [ %minidx.partial.2.exit.add2, %minidx.partial.2.exit ],
+  //     [ %minidx.vect.end.known_arg.lcssa, %minidx.partial.2.if ]
+  // br label %ExitBB
+  PHINode *MinIdxRet = Builder.CreatePHI(I64Ty, 3, "minidx.ret");
+  MinIdxRet->addIncoming(MinIdxVectEndKnownArg, MinIdxVectEnd);
+  MinIdxRet->addIncoming(MinIdxPartial2ExitAdd, MinIdxPartial2Exit);
+  MinIdxRet->addIncoming(MinIdxVectEndKnownArg, MinIdxPartial2If);
+
+  // Truncate the i64 result to the function's return type.
+  Value *MinIdxRetBitCast = Builder.CreateTruncOrBitCast(
+      MinIdxRet, F->getReturnType(), "minidx.ret.bitcast");
+
+  Builder.CreateBr(ExitBB);
+  DTU.applyUpdates({{DominatorTree::Insert, MinIdxEnd, ExitBB}});
+
+  MinIdxVectBodyPhi1->addIncoming(Umax, MinIdxWhileBodyLrPh);
+  MinIdxVectBodyPhi1->addIncoming(MinIdxVectBodySub, MinIdxVectContinue);
+
+  MinIdxVectBodyKnownArg->addIncoming(Partial1ExitKnownArg,
+                                      MinIdxWhileBodyLrPh);
+  MinIdxVectBodyKnownArg->addIncoming(MinIdxVectContinueKnownArg,
+                                      MinIdxVectContinue);
+
+  MinIdxVectBodyKnownMin->addIncoming(Partial1ExitKnownMin,
+                                      MinIdxWhileBodyLrPh);
+  MinIdxVectBodyKnownMin->addIncoming(MinIdxVectContinueKnownMin,
+                                      MinIdxVectContinue);
+
+  // Collect the PHIs that need to be replaced.
+  SmallVector<PHINode *> PHIsToReplace;
+  for (PHINode &PHI : ExitBB->phis()) {
+    PHIsToReplace.push_back(&PHI);
+  }
+
+  // Now perform the replacement.
+  for (PHINode *PHI : PHIsToReplace) {
+    // Create a PHI at the beginning of the block.
+    Builder.SetInsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
+    PHINode *ExitPHI =
+        Builder.CreatePHI(F->getReturnType(), PHI->getNumIncomingValues() + 1);
+    for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
+      ExitPHI->addIncoming(PHI->getIncomingValue(I), PHI->getIncomingBlock(I));
+    }
+    ExitPHI->addIncoming(MinIdxRetBitCast, MinIdxEnd);
+    // Replace all uses of PHI with ExitPHI.
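+    // The rebuilt PHI keeps every original incoming (value, block) pair and
+    // additionally receives the truncated vectorized result from minidx.end,
+    // so the exit block merges the scalar and the vectorized paths.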
+    PHI->replaceAllUsesWith(ExitPHI);
+    PHI->eraseFromParent();
+  }
+
+  VecLoop->verifyLoop();
+  if (!VecLoop->isRecursivelyLCSSAForm(*DT, *LI)) {
+    LLVM_DEBUG(dbgs() << "Loop is not in LCSSA form\n");
+    VecLoop->print(dbgs());
+    VecLoop->dump();
+  }
+
+  return true;
+}
+
 bool LoopIdiomVectorize::recognizeByteCompare() {
   // Currently the transformation only works on scalable vector types, although
   // there is no fundamental reason why it cannot be made to work for fixed
diff --git a/llvm/test/Transforms/LoopIdiom/last-min-index-ftn.ll b/llvm/test/Transforms/LoopIdiom/last-min-index-ftn.ll
new file mode 100644
index 0000000000000..013bb45b44e2c
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/last-min-index-ftn.ll
@@ -0,0 +1,291 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW1-IL4
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL1
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s --check-prefix=CHECK-REV-MIN-VW4-IL2
+; RUN: opt -passes=loop-idiom-vectorize -S -mtriple=aarch64 -mattr=+sve %s | FileCheck %s --check-prefix=CHECK-LOOP-IDIOM
+
+; This test case is extracted from the rnflow (Fortran) benchmark in the
+; Polyhedron benchmark suite. The function minlst takes two indices (i.e. a
+; range), scans the range backwards, and returns the last index of the
+; minimum value.
+
+define i32 @minlst(i32 %first_index, i32 %last_index, ptr %array) {
+; CHECK-REV-MIN-VW1-IL4-LABEL: define i32 @minlst(
+; CHECK-REV-MIN-VW1-IL4-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) {
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[ENTRY:.*]]:
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-REV-MIN-VW1-IL4-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN-VW1-IL4: [[LOOP_PREHEADER]]:
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: br label %[[LOOP:.*]]
+; CHECK-REV-MIN-VW1-IL4: [[LOOP]]:
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-REV-MIN-VW1-IL4-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN-VW1-IL4: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN-VW1-IL4: [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-VW1-IL4-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[SELECT_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-VW1-IL4-NEXT: ret i32 [[LAST_INDEX_RET]]
+;
+; CHECK-REV-MIN-VW4-IL1-LABEL: define i32 @minlst(
+; CHECK-REV-MIN-VW4-IL1-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) {
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[ENTRY:.*]]:
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-REV-MIN-VW4-IL1-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN-VW4-IL1: [[LOOP_PREHEADER]]:
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: br label %[[LOOP:.*]]
+; CHECK-REV-MIN-VW4-IL1: [[LOOP]]:
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-REV-MIN-VW4-IL1-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN-VW4-IL1: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN-VW4-IL1: [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-VW4-IL1-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[SELECT_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-VW4-IL1-NEXT: ret i32 [[LAST_INDEX_RET]]
+;
+; CHECK-REV-MIN-VW4-IL2-LABEL: define i32 @minlst(
+; CHECK-REV-MIN-VW4-IL2-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) {
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[ENTRY:.*]]:
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-REV-MIN-VW4-IL2-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-REV-MIN-VW4-IL2: [[LOOP_PREHEADER]]:
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: br label %[[LOOP:.*]]
+; CHECK-REV-MIN-VW4-IL2: [[LOOP]]:
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-REV-MIN-VW4-IL2-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-REV-MIN-VW4-IL2: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-REV-MIN-VW4-IL2: [[__CRIT_EDGE:.*:]]
+; CHECK-REV-MIN-VW4-IL2-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[SELECT_LCSSA]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-REV-MIN-VW4-IL2-NEXT: ret i32 [[LAST_INDEX_RET]]
+;
+; CHECK-LOOP-IDIOM-LABEL: define i32 @minlst(
+; CHECK-LOOP-IDIOM-SAME: i32 [[FIRST_INDEX:%.*]], i32 [[LAST_INDEX:%.*]], ptr [[ARRAY:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-LOOP-IDIOM-NEXT: [[ENTRY:.*]]:
+; CHECK-LOOP-IDIOM-NEXT: [[FIRST_INDEX_SEXT:%.*]] = sext i32 [[FIRST_INDEX]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_NEG:%.*]] = sub i32 0, [[LAST_INDEX]]
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_NEG_SEXT:%.*]] = sext i32 [[LAST_INDEX_NEG]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[ADD:%.*]] = add nsw i64 [[FIRST_INDEX_SEXT]], [[LAST_INDEX_NEG_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: [[DIFF:%.*]] = sub nsw i64 0, [[ADD]]
+; CHECK-LOOP-IDIOM-NEXT: [[FIRST_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -8
+; CHECK-LOOP-IDIOM-NEXT: [[SECOND_PTR:%.*]] = getelementptr i8, ptr [[ARRAY]], i64 -4
+; CHECK-LOOP-IDIOM-NEXT: [[EARLY_EXIT_COND:%.*]] = icmp slt i64 [[ADD]], 0
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[EARLY_EXIT_COND]], label %[[LOOP_PREHEADER:.*]], [[DOT_CRIT_EDGE:label %.*]]
+; CHECK-LOOP-IDIOM: [[LOOP_PREHEADER]]:
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_SEXT:%.*]] = sext i32 [[LAST_INDEX]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VLEN:%.*]] = shl i64 [[TMP0]], 2
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_NOT:%.*]] = sub i64 0, [[MINIDX_VLEN]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_AND:%.*]] = and i64 [[LAST_INDEX_SEXT]], [[MINIDX_NOT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_UMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[MINIDX_AND]], i64 [[FIRST_INDEX_SEXT]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_ADD:%.*]] = add i64 [[LAST_INDEX_SEXT]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[MINIDX_UMAX]], i64 [[MINIDX_ADD]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_UMAX_MINUS1:%.*]] = sub i64 [[MINIDX_UMAX]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_ADD_PTR_I:%.*]] = getelementptr inbounds float, ptr [[FIRST_PTR]], i64 [[MINIDX_UMAX_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_LOADVALS:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[MINIDX_ADD_PTR_I]], i32 1, <vscale x 4 x i1> [[MINIDX_MASK]], <vscale x 4 x float> zeroinitializer)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_CURRENTVALS:%.*]] = select <vscale x 4 x i1> [[MINIDX_MASK]], <vscale x 4 x float> [[MINIDX_LOADVALS]], <vscale x 4 x float> splat (float 0x7FF0000000000000)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[MINIDX_MASK]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_REVERSEVALS:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MINIDX_CURRENTVALS]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_MINVAL:%.*]] = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> [[MINIDX_REVERSEVALS]])
+; CHECK-LOOP-IDIOM-NEXT: br i1 true, label %[[MINIDX_VEC_ENTRY:.*]], label %[[LOOP:.*]]
+; CHECK-LOOP-IDIOM: [[LOOP]]:
+; CHECK-LOOP-IDIOM-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX_SEXT]], %[[LOOP_PREHEADER]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[DEC_IV:%.*]] = phi i64 [ [[DEC:%.*]], %[[LOOP]] ], [ [[DIFF]], %[[LOOP_PREHEADER]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[INDEX:%.*]] = phi i32 [ [[SELECT:%.*]], %[[LOOP]] ], [ [[LAST_INDEX]], %[[LOOP_PREHEADER]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD1_PTR:%.*]] = getelementptr float, ptr [[FIRST_PTR]], i64 [[IV]]
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD1:%.*]] = load float, ptr [[LOAD1_PTR]], align 4
+; CHECK-LOOP-IDIOM-NEXT: [[INDEX_SEXT:%.*]] = sext i32 [[INDEX]] to i64
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD2_PTR:%.*]] = getelementptr float, ptr [[SECOND_PTR]], i64 [[INDEX_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: [[LOAD2:%.*]] = load float, ptr [[LOAD2_PTR]], align 4
+; CHECK-LOOP-IDIOM-NEXT: [[CMP:%.*]] = fcmp contract olt float [[LOAD1]], [[LOAD2]]
+; CHECK-LOOP-IDIOM-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc nsw i64 [[IV_NEXT]] to i32
+; CHECK-LOOP-IDIOM-NEXT: [[SELECT]] = select i1 [[CMP]], i32 [[IV_NEXT_TRUNC]], i32 [[INDEX]]
+; CHECK-LOOP-IDIOM-NEXT: [[DEC]] = add nsw i64 [[DEC_IV]], -1
+; CHECK-LOOP-IDIOM-NEXT: [[LOOP_COND:%.*]] = icmp sgt i64 [[DEC_IV]], 1
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[DOT_CRIT_EDGE_LOOPEXIT:.*]]
+; CHECK-LOOP-IDIOM: [[__CRIT_EDGE_LOOPEXIT:.*:]]
+; CHECK-LOOP-IDIOM-NEXT: [[TMP1:%.*]] = phi i32 [ [[SELECT]], %[[LOOP]] ], [ [[MINIDX_RET_BITCAST:%.*]], %[[MINIDX_END:.*]] ]
+; CHECK-LOOP-IDIOM-NEXT: br [[DOT_CRIT_EDGE]]
+; CHECK-LOOP-IDIOM: [[__CRIT_EDGE:.*:]]
+; CHECK-LOOP-IDIOM-NEXT: [[LAST_INDEX_RET:%.*]] = phi i32 [ [[LAST_INDEX]], %[[ENTRY]] ], [ [[TMP1]], %[[DOT_CRIT_EDGE_LOOPEXIT]] ]
+; CHECK-LOOP-IDIOM-NEXT: ret i32 [[LAST_INDEX_RET]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VEC_ENTRY]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_ENTRY_CMP:%.*]] = fcmp olt float [[MINIDX_MINVAL]], 0x7FF0000000000000
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_ENTRY_CMP]], label %[[MINIDX_PARTIAL_1_IF:.*]], label %[[MINIDX_PARTIAL_1_PROC_EXIT:.*]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_1_IF]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINVAL_SPLAT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[MINIDX_MINVAL]], i64 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINVAL_SPLAT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINVAL_SPLAT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_CMP:%.*]] = fcmp oeq <vscale x 4 x float> [[MINIDX_REVERSEVALS]], [[MINVAL_SPLAT_SPLAT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_AND:%.*]] = and <vscale x 4 x i1> [[MINIDX_REVERSE]], [[MINIDX_PARTIAL_1_CMP]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_CTTZ:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[MINIDX_PARTIAL_1_AND]], i1 true)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_TMP:%.*]] = sub i64 [[MINIDX_VLEN]], [[MINIDX_PARTIAL_1_CTTZ]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_TMP_MINUS1:%.*]] = sub i64 [[MINIDX_PARTIAL_1_TMP]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_ADD2:%.*]] = add i64 [[MINIDX_UMAX]], [[MINIDX_PARTIAL_1_TMP_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_PARTIAL_1_PROC_EXIT]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_1_PROC_EXIT]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_EXIT_KNOWN_MIN:%.*]] = phi float [ [[MINIDX_MINVAL]], %[[MINIDX_PARTIAL_1_IF]] ], [ 0x7FF0000000000000, %[[MINIDX_VEC_ENTRY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[PARTIAL1_EXIT_KNOWN_ARG:%.*]] = phi i64 [ [[MINIDX_PARTIAL_1_ADD2]], %[[MINIDX_PARTIAL_1_IF]] ], [ 0, %[[MINIDX_VEC_ENTRY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_PROC_EXIT_ADD:%.*]] = add i64 [[MINIDX_VLEN]], [[FIRST_INDEX_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_1_PROC_EXIT_ICMP:%.*]] = icmp ult i64 [[MINIDX_UMAX]], [[MINIDX_PARTIAL_1_PROC_EXIT_ADD]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_PARTIAL_1_PROC_EXIT_ICMP]], label %[[MINIDX_VECT_END:.*]], label %[[MINIDX_WHILE_BODY_PH:.*]]
+; CHECK-LOOP-IDIOM: [[MINIDX_WHILE_BODY_PH]]:
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_VECT_BODY:.*]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_BODY]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_PHI1:%.*]] = phi i64 [ [[MINIDX_UMAX]], %[[MINIDX_WHILE_BODY_PH]] ], [ [[MINIDX_VECT_BODY_SUB:%.*]], %[[MINIDX_VECT_CONTINUE:.*]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_KNOWN_ARG:%.*]] = phi i64 [ [[PARTIAL1_EXIT_KNOWN_ARG]], %[[MINIDX_WHILE_BODY_PH]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_ARG:%.*]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_KNOWN_MIN:%.*]] = phi float [ [[MINIDX_PARTIAL_1_EXIT_KNOWN_MIN]], %[[MINIDX_WHILE_BODY_PH]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_MIN:%.*]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_SUB]] = sub i64 [[MINIDX_VECT_BODY_PHI1]], [[MINIDX_VLEN]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_SUB_MINUS1:%.*]] = sub i64 [[MINIDX_VECT_BODY_SUB]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_GEP:%.*]] = getelementptr inbounds float, ptr [[FIRST_PTR]], i64 [[MINIDX_VECT_BODY_SUB_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_UNMASKEDLOAD:%.*]] = load <vscale x 4 x float>, ptr [[MINIDX_VECT_BODY_GEP]], align 16
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MINIDX_VECT_BODY_UNMASKEDLOAD]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_REDUCE:%.*]] = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> [[MINIDX_VECT_BODY_REVERSE]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_FCMP:%.*]] = fcmp olt float [[MINIDX_VECT_BODY_REDUCE]], [[MINIDX_VECT_BODY_KNOWN_MIN]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_VECT_BODY_FCMP]], label %[[MINIDX_VECT_UPDATE:.*]], label %[[MINIDX_VECT_CONTINUE]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_UPDATE]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[MINIDX_VECT_BODY_REDUCE]], i64 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_FCMP:%.*]] = fcmp ueq <vscale x 4 x float> [[MINIDX_VECT_BODY_REVERSE]], [[MINIDX_VECT_UPDATE_SPLATINSERT_SPLAT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_CTTZ:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[MINIDX_VECT_UPDATE_FCMP]], i1 true)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_MUL:%.*]] = mul i64 [[MINIDX_VECT_UPDATE_CTTZ]], -1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_UPDATE_ADD:%.*]] = add i64 [[MINIDX_VECT_BODY_PHI1]], [[MINIDX_VECT_UPDATE_MUL]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_BODY_ADD2:%.*]] = add i64 [[MINIDX_VECT_UPDATE_ADD]], -1
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_VECT_CONTINUE]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_CONTINUE]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_CONTINUE_KNOWN_MIN]] = phi float [ [[MINIDX_VECT_BODY_REDUCE]], %[[MINIDX_VECT_UPDATE]] ], [ [[MINIDX_VECT_BODY_KNOWN_MIN]], %[[MINIDX_VECT_BODY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_CONTINUE_KNOWN_ARG]] = phi i64 [ [[MINIDX_VECT_BODY_ADD2]], %[[MINIDX_VECT_UPDATE]] ], [ [[MINIDX_VECT_BODY_KNOWN_ARG]], %[[MINIDX_VECT_BODY]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_CONTINUE_ICMP:%.*]] = icmp ult i64 [[MINIDX_VECT_BODY_SUB]], [[MINIDX_PARTIAL_1_PROC_EXIT_ADD]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_VECT_CONTINUE_ICMP]], label %[[MINIDX_VECT_END]], label %[[MINIDX_VECT_BODY]]
+; CHECK-LOOP-IDIOM: [[MINIDX_VECT_END]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_KNOWN_MIN_LCSSA:%.*]] = phi float [ [[MINIDX_PARTIAL_1_EXIT_KNOWN_MIN]], %[[MINIDX_PARTIAL_1_PROC_EXIT]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_MIN]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_KNOWN_ARG_LCSSA:%.*]] = phi i64 [ [[PARTIAL1_EXIT_KNOWN_ARG]], %[[MINIDX_PARTIAL_1_PROC_EXIT]] ], [ [[MINIDX_VECT_CONTINUE_KNOWN_ARG]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_LCSSA:%.*]] = phi i64 [ [[MINIDX_UMAX]], %[[MINIDX_PARTIAL_1_PROC_EXIT]] ], [ [[MINIDX_VECT_BODY_SUB]], %[[MINIDX_VECT_CONTINUE]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_VECT_END_CMP:%.*]] = icmp ugt i64 [[MINIDX_VECT_END_LCSSA]], [[FIRST_INDEX_SEXT]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_VECT_END_CMP]], label %[[MINIDX_PARTIAL_2_IF:.*]], label %[[MINIDX_END]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_2_IF]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_ADD_ZERO:%.*]] = add i64 [[MINIDX_VECT_END_LCSSA]], 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[FIRST_INDEX_SEXT]], i64 [[MINIDX_PARTIAL_2_IF_ADD_ZERO]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_MASK_REVERSE:%.*]] = call <vscale x 4 x i1> @llvm.vector.reverse.nxv4i1(<vscale x 4 x i1> [[MINIDX_PARTIAL_2_IF_MASK]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_IPOS1_MINUS1:%.*]] = sub i64 [[FIRST_INDEX_SEXT]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_GEP:%.*]] = getelementptr inbounds float, ptr [[FIRST_PTR]], i64 [[MINIDX_PARTIAL_2_IF_IPOS1_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[MINIDX_PARTIAL_2_IF_GEP]], i32 1, <vscale x 4 x i1> [[MINIDX_PARTIAL_2_IF_MASK_REVERSE]], <vscale x 4 x float> zeroinitializer)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_REVERSE:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MINIDX_PARTIAL_2_IF_LOAD]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_REDUCE:%.*]] = call float @llvm.vector.reduce.fminimum.nxv4f32(<vscale x 4 x float> [[MINIDX_PARTIAL_2_IF_REVERSE]])
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_IF_FCMP:%.*]] = fcmp olt float [[MINIDX_PARTIAL_2_IF_REDUCE]], [[MINIDX_VECT_END_KNOWN_MIN_LCSSA]]
+; CHECK-LOOP-IDIOM-NEXT: br i1 [[MINIDX_PARTIAL_2_IF_FCMP]], label %[[MINIDX_PARTIAL_2_EXIT:.*]], label %[[MINIDX_END]]
+; CHECK-LOOP-IDIOM: [[MINIDX_PARTIAL_2_EXIT]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[MINIDX_PARTIAL_2_IF_REDUCE]], i64 0
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLATINSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_FCMP:%.*]] = fcmp oeq <vscale x 4 x float> [[MINIDX_PARTIAL_2_IF_REVERSE]], [[MINIDX_PARTIAL_2_EXIT_SPLATINSERT_SPLAT]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_AND:%.*]] = and <vscale x 4 x i1> [[MINIDX_PARTIAL_2_EXIT_FCMP]], [[MINIDX_PARTIAL_2_IF_MASK_REVERSE]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_CTTZ:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[MINIDX_PARTIAL_2_EXIT_AND]], i1 true)
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_TMP:%.*]] = sub i64 [[MINIDX_VLEN]], [[MINIDX_PARTIAL_2_EXIT_CTTZ]]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_TMP_MINUS1:%.*]] = sub i64 [[MINIDX_PARTIAL_2_EXIT_TMP]], 1
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_PARTIAL_2_EXIT_ADD2:%.*]] = add i64 [[FIRST_INDEX_SEXT]], [[MINIDX_PARTIAL_2_EXIT_TMP_MINUS1]]
+; CHECK-LOOP-IDIOM-NEXT: br label %[[MINIDX_END]]
+; CHECK-LOOP-IDIOM: [[MINIDX_END]]:
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_RET:%.*]] = phi i64 [ [[MINIDX_VECT_END_KNOWN_ARG_LCSSA]], %[[MINIDX_VECT_END]] ], [ [[MINIDX_PARTIAL_2_EXIT_ADD2]], %[[MINIDX_PARTIAL_2_EXIT]] ], [ [[MINIDX_VECT_END_KNOWN_ARG_LCSSA]], %[[MINIDX_PARTIAL_2_IF]] ]
+; CHECK-LOOP-IDIOM-NEXT: [[MINIDX_RET_BITCAST]] = trunc i64 [[MINIDX_RET]] to i32
+; CHECK-LOOP-IDIOM-NEXT: br label %[[DOT_CRIT_EDGE_LOOPEXIT]]
+;
+entry:
+  %first_index_sext = sext i32 %first_index to i64
+  %last_index_neg = sub i32 0, %last_index
+  %last_index_neg_sext = sext i32 %last_index_neg to i64
+  %add = add nsw i64 %first_index_sext, %last_index_neg_sext
+  %diff = sub nsw i64 0, %add
+  %first_ptr = getelementptr i8, ptr %array, i64 -8
+  %second_ptr = getelementptr i8, ptr %array, i64 -4
+  %early_exit_cond = icmp slt i64 %add, 0
+  br i1 %early_exit_cond, label %loop.preheader, label %._crit_edge
+
+loop.preheader: ; preds = %entry
+  %last_index_sext = sext i32 %last_index to i64
+  br label %loop
+
+loop: ; preds = %loop.preheader, %loop
+  %iv = phi i64 [ %iv.next, %loop ], [ %last_index_sext, %loop.preheader ]
+  %dec_iv = phi i64 [ %dec, %loop ], [ %diff, %loop.preheader ]
+  %index = phi i32 [ %select, %loop ], [ %last_index, %loop.preheader ]
+  %iv.next = add nsw i64 %iv, -1
+  %load1_ptr = getelementptr float, ptr %first_ptr, i64 %iv
+  %load1 = load float, ptr %load1_ptr, align 4
+  %index_sext = sext i32 %index to i64
+  %load2_ptr = getelementptr float, ptr %second_ptr, i64 %index_sext
+  %load2 = load float, ptr %load2_ptr, align 4
+  %cmp = fcmp contract olt float %load1, %load2
+  %iv.next.trunc = trunc nsw i64 %iv.next to i32
+  %select = select i1 %cmp, i32 %iv.next.trunc, i32 %index
+  %dec = add nsw i64 %dec_iv, -1
+  %loop_cond = icmp sgt i64 %dec_iv, 1
+  br i1 %loop_cond, label %loop, label %._crit_edge
+
+._crit_edge: ; preds = %loop, %entry
+  %last_index_ret = phi i32 [ %select, %loop ], [ %last_index, %entry ]
+  ret i32 %last_index_ret
+}