Skip to content

[LV] Add a statistic for early exit vectorization #145730

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
Expand Down Expand Up @@ -7324,6 +7325,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
"Trying to execute plan with unsupported VF");
assert(BestVPlan.hasUF(BestUF) &&
"Trying to execute plan with unsupported UF");
++LoopsVectorized;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to leave the other increments as-is for now.

Doesn't this change now increment LoopsVectorized twice per loop if the epilogue gets vectorized?

if (BestVPlan.hasEarlyExit())
++LoopsEarlyExitVectorized;
if (VectorizingEpilogue)
++LoopsEpilogueVectorized;
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
Expand Down Expand Up @@ -10259,7 +10265,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);

LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT,
/*VectorizingEpilogue*/ false);

ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
Expand Down Expand Up @@ -10288,9 +10295,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI, &CM, BFI, PSI, Checks,
*BestMainPlan);
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;
auto ExpandedSCEVs =
LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan,
MainILV, DT, /*VectorizingEpilogue*/ false);

// Second pass vectorizes the epilogue and adjusts the control flow
// edges from the first pass.
Expand All @@ -10303,7 +10310,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);

LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true);
DT, /*VectorizingEpilogue*/ true);

// Fix induction resume values from the additional bypass block.
BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
Expand All @@ -10318,7 +10325,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// TODO: Directly add as extra operand to the VPResumePHI recipe.
Inc->setIncomingValueForBlock(BypassBlock, V);
}
++LoopsEpilogueVectorized;

if (!Checks.hasChecks())
DisableRuntimeUnroll = true;
Expand All @@ -10327,7 +10333,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
Checks, BestPlan);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;

// Add metadata to disable runtime unrolling a scalar loop when there
// are no runtime checks about strides and memory. A scalar loop that is
Expand Down
47 changes: 41 additions & 6 deletions llvm/test/Transforms/LoopVectorize/vect.stats.ll
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize --disable-output -stats -S 2>&1 | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization=1 --disable-output -stats -S 2>&1 | FileCheck %s
; REQUIRES: asserts

;
; We have 2 loops, one of them is vectorizable and the second one is not.
;
; We have 3 loops, two of them are vectorizable (with one being early-exit
; vectorized) and the third one is not.

; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
; CHECK: 1 loop-vectorize - Number of loops vectorized
; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization
; CHECK: 1 loop-vectorize - Number of early exit loops vectorized
; CHECK: 2 loop-vectorize - Number of loops vectorized

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

Expand All @@ -31,6 +31,39 @@ for.end: ; preds = %entry, %for.body
ret void
}

; Test input with a loop that has two exits: an early exit to %found taken
; when the elements loaded from the two local arrays compare equal (returns 1),
; and the latch exit to %exit taken when the counter compare fails (returns 0).
define i32 @early_exit_vectorized(i32 %end) {
entry:
; Two local [1024 x i32] buffers, each passed to the external @init_mem along
; with the length 1024. Only the declaration of @init_mem is visible here;
; presumably it fills the buffers -- confirm against the other tests using it.
%p1 = alloca [1024 x i32]
%p2 = alloca [1024 x i32]
call void @init_mem(ptr %p1, i64 1024)
call void @init_mem(ptr %p2, i64 1024)
; Mask %end down to the range [0, 1023].
%end.clamped = and i32 %end, 1023
br label %for.body

for.body:
; %ind is the i8 counter feeding the latch compare; %gep.ind is the separate
; i64 index used to address both arrays. Both start at 0.
%ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ]
%gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ]
%arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind
%0 = load i32, ptr %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind
%1 = load i32, ptr %arrayidx2, align 4
; Early exit: leave the loop as soon as the two loaded elements are equal.
%cmp.early = icmp eq i32 %0, %1
br i1 %cmp.early, label %found, label %for.inc

for.inc:
; Latch: advance both counters by one and keep iterating while the
; zero-extended i8 counter is unsigned-less-than %end.clamped.
%ind.next = add i8 %ind, 1
%conv = zext i8 %ind.next to i32
%gep.ind.next = add i64 %gep.ind, 1
%cmp = icmp ult i32 %conv, %end.clamped
br i1 %cmp, label %for.body, label %exit

found:
; Reached only through the early exit.
ret i32 1

exit:
; Reached only through the latch exit.
ret i32 0
}

define void @not_vectorized(ptr nocapture %a, i64 %size) {
entry:
%cmp1 = icmp sle i64 %size, 0
Expand All @@ -56,3 +89,5 @@ for.body: ; preds = %entry, %for.body
for.end: ; preds = %entry, %for.body
ret void
}

declare void @init_mem(ptr, i64);
Loading