Skip to content

Commit 0fc357f

Browse files
committed
[LV] Add a statistic for early exit vectorization
We currently do not vectorize epilogue loops with early exits, but the statistics are updated on the epilogue path as well for completeness.
1 parent c92b580 commit 0fc357f

File tree

2 files changed

+53
-13
lines changed

2 files changed

+53
-13
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
175175
STATISTIC(LoopsVectorized, "Number of loops vectorized");
176176
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177177
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178+
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
178179

179180
static cl::opt<bool> EnableEpilogueVectorization(
180181
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -7324,6 +7325,11 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
73247325
"Trying to execute plan with unsupported VF");
73257326
assert(BestVPlan.hasUF(BestUF) &&
73267327
"Trying to execute plan with unsupported UF");
7328+
++LoopsVectorized;
7329+
if (BestVPlan.hasEarlyExit())
7330+
++LoopsEarlyExitVectorized;
7331+
if (VectorizingEpilogue)
7332+
++LoopsEpilogueVectorized;
73277333
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
73287334
// cost model is complete for better cost estimates.
73297335
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
@@ -10259,7 +10265,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1025910265
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
1026010266
ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);
1026110267

10262-
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10268+
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT,
10269+
/*VectorizingEpilogue*/ false);
1026310270

1026410271
ORE->emit([&]() {
1026510272
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
@@ -10288,9 +10295,9 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1028810295
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
1028910296
EPI, &CM, BFI, PSI, Checks,
1029010297
*BestMainPlan);
10291-
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
10292-
*BestMainPlan, MainILV, DT, false);
10293-
++LoopsVectorized;
10298+
auto ExpandedSCEVs =
10299+
LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, *BestMainPlan,
10300+
MainILV, DT, /*VectorizingEpilogue*/ false);
1029410301

1029510302
// Second pass vectorizes the epilogue and adjusts the control flow
1029610303
// edges from the first pass.
@@ -10303,7 +10310,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1030310310
preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
1030410311

1030510312
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10306-
DT, true);
10313+
DT, /*VectorizingEpilogue*/ true);
1030710314

1030810315
// Fix induction resume values from the additional bypass block.
1030910316
BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
@@ -10318,7 +10325,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1031810325
// TODO: Directly add as extra operand to the VPResumePHI recipe.
1031910326
Inc->setIncomingValueForBlock(BypassBlock, V);
1032010327
}
10321-
++LoopsEpilogueVectorized;
1032210328

1032310329
if (!Checks.hasChecks())
1032410330
DisableRuntimeUnroll = true;
@@ -10327,7 +10333,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1032710333
VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
1032810334
Checks, BestPlan);
1032910335
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10330-
++LoopsVectorized;
1033110336

1033210337
// Add metadata to disable runtime unrolling a scalar loop when there
1033310338
// are no runtime checks about strides and memory. A scalar loop that is

llvm/test/Transforms/LoopVectorize/vect.stats.ll

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize --disable-output -stats -S 2>&1 | FileCheck %s
1+
; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -debug-only=loop-vectorize -enable-early-exit-vectorization=1 --disable-output -stats -S 2>&1 | FileCheck %s
22
; REQUIRES: asserts
33

4-
;
5-
; We have 2 loops, one of them is vectorizable and the second one is not.
6-
;
4+
; We have 3 loops, two of them are vectorizable (with one being early-exit
5+
; vectorized) and the third one is not.
76

8-
; CHECK: 2 loop-vectorize - Number of loops analyzed for vectorization
9-
; CHECK: 1 loop-vectorize - Number of loops vectorized
7+
; CHECK: 3 loop-vectorize - Number of loops analyzed for vectorization
8+
; CHECK: 1 loop-vectorize - Number of early exit loops vectorized
9+
; CHECK: 2 loop-vectorize - Number of loops vectorized
1010

1111
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
1212

@@ -31,6 +31,39 @@ for.end: ; preds = %entry, %for.body
3131
ret void
3232
}
3333

34+
define i32 @early_exit_vectorized(i32 %end) {
35+
entry:
36+
%p1 = alloca [1024 x i32]
37+
%p2 = alloca [1024 x i32]
38+
call void @init_mem(ptr %p1, i64 1024)
39+
call void @init_mem(ptr %p2, i64 1024)
40+
%end.clamped = and i32 %end, 1023
41+
br label %for.body
42+
43+
for.body:
44+
%ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ]
45+
%gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ]
46+
%arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind
47+
%0 = load i32, ptr %arrayidx1, align 4
48+
%arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind
49+
%1 = load i32, ptr %arrayidx2, align 4
50+
%cmp.early = icmp eq i32 %0, %1
51+
br i1 %cmp.early, label %found, label %for.inc
52+
53+
for.inc:
54+
%ind.next = add i8 %ind, 1
55+
%conv = zext i8 %ind.next to i32
56+
%gep.ind.next = add i64 %gep.ind, 1
57+
%cmp = icmp ult i32 %conv, %end.clamped
58+
br i1 %cmp, label %for.body, label %exit
59+
60+
found:
61+
ret i32 1
62+
63+
exit:
64+
ret i32 0
65+
}
66+
3467
define void @not_vectorized(ptr nocapture %a, i64 %size) {
3568
entry:
3669
%cmp1 = icmp sle i64 %size, 0
@@ -56,3 +89,5 @@ for.body: ; preds = %entry, %for.body
5689
for.end: ; preds = %entry, %for.body
5790
ret void
5891
}
92+
93+
declare void @init_mem(ptr, i64);

0 commit comments

Comments
 (0)