Skip to content

Commit f15d348

Browse files
committed
[VPlan] Introduce explicit broadcast for live-in constants.
This patch makes the broadcast of live-in constants explicit in VPlan. This will help the VPlan-based cost model account for the broadcast cost and track the register pressure of the broadcast value in the future. Live-in constants usually have only a single user, so the `broadcast` is inserted immediately before that user to reduce the live range of the broadcast value and to avoid changes to the generated vector IR.
1 parent 4af96a9 commit f15d348

File tree

8 files changed

+64
-20
lines changed

8 files changed

+64
-20
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1339,6 +1339,17 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
13391339
void print(raw_ostream &O, const Twine &Indent,
13401340
VPSlotTracker &SlotTracker) const override;
13411341
#endif
1342+
1343+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1344+
assert(is_contained(operands(), Op) &&
1345+
"Op must be an operand of the recipe");
1346+
switch (Opcode) {
1347+
default:
1348+
return false;
1349+
case Instruction::ExtractValue:
1350+
return Op == getOperand(1);
1351+
}
1352+
}
13421353
};
13431354

13441355
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
@@ -1533,6 +1544,14 @@ class VPWidenCallRecipe : public VPRecipeWithIRFlags, public VPIRMetadata {
15331544
void print(raw_ostream &O, const Twine &Indent,
15341545
VPSlotTracker &SlotTracker) const override;
15351546
#endif
1547+
1548+
/// Returns true if the recipe only uses the first lane of operand \p Op.
1549+
bool onlyFirstLaneUsed(const VPValue *Op) const override {
1550+
assert(is_contained(operands(), Op) &&
1551+
"Op must be an operand of the recipe");
1552+
// Scalar called fuction cannot be vectorized.
1553+
return Op == getOperand(getNumOperands() - 1);
1554+
}
15361555
};
15371556

15381557
/// A recipe representing a sequence of load -> update -> store as part of

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -938,6 +938,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
938938
default:
939939
return false;
940940
case Instruction::ExtractElement:
941+
case Instruction::ExtractValue:
941942
return Op == getOperand(1);
942943
case Instruction::PHI:
943944
return true;
@@ -959,8 +960,9 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
959960
case VPInstruction::PtrAdd:
960961
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
961962
case VPInstruction::ComputeAnyOfResult:
962-
case VPInstruction::ComputeFindLastIVResult:
963963
return Op == getOperand(1);
964+
case VPInstruction::ComputeFindLastIVResult:
965+
return Op == getOperand(1) || Op == getOperand(2);
964966
};
965967
llvm_unreachable("switch should return");
966968
}

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3073,10 +3073,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
30733073

30743074
auto *VectorPreheader = Plan.getVectorPreheader();
30753075
for (VPValue *VPV : VPValues) {
3076-
if (all_of(VPV->users(),
3077-
[VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
3078-
(VPV->isLiveIn() && VPV->getLiveInIRValue() &&
3079-
isa<Constant>(VPV->getLiveInIRValue())))
3076+
if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }))
30803077
continue;
30813078

30823079
// Add explicit broadcast at the insert point that dominates all users.
@@ -3093,8 +3090,25 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
30933090
"All users must be in the vector preheader or dominated by it");
30943091
}
30953092

3096-
VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3097-
auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3093+
VPInstruction *Broadcast;
3094+
if (VPV->isLiveIn() && isa_and_nonnull<Constant>(VPV->getLiveInIRValue())) {
3095+
// We cannot replace the constant live-ins for PHIs by broadcast in the
3096+
// same VPBB because it will break PHI. Also cannot replace the
3097+
// VPWidenGEPRecipe since it broadcasts the generated pointer instead of
3098+
// operands.
3099+
if (auto *R = dyn_cast_if_present<VPRecipeBase>(*(VPV->users().begin()));
3100+
R && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPWidenGEPRecipe>(R) &&
3101+
!VPV->hasMoreThanOneUniqueUser()) {
3102+
Broadcast = new VPInstruction(VPInstruction::Broadcast, {VPV});
3103+
// Insert just before the user to reduce register pressure.
3104+
Broadcast->insertBefore(R);
3105+
} else {
3106+
continue;
3107+
}
3108+
} else {
3109+
VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
3110+
Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
3111+
}
30983112
VPV->replaceUsesWithIf(Broadcast,
30993113
[VPV, Broadcast](VPUser &U, unsigned Idx) {
31003114
return Broadcast != &U && !U.usesScalars(VPV);

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -239,10 +239,11 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
239239
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
240240
; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
241241
; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
242-
; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, ir<1>
242+
; CHECK-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1>
243+
; CHECK-NEXT: WIDEN ir<%add9> = add ir<%19>, vp<[[BROADCAST]]>
243244
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
244-
; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
245-
; CHECK-NEXT: WIDEN store vp<%5>, ir<%add9>
245+
; CHECK-NEXT: vp<[[VECTOR_END_POINTER1:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
246+
; CHECK-NEXT: WIDEN store vp<[[VECTOR_END_POINTER1]]>, ir<%add9>
246247
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
247248
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
248249
; CHECK-NEXT: Successor(s): middle.block, vector.body
@@ -648,10 +649,11 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
648649
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
649650
; CHECK-NEXT: vp<%4> = vector-end-pointer inbounds ir<%arrayidx>, ir<%18>
650651
; CHECK-NEXT: WIDEN ir<%19> = load vp<%4>
651-
; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, ir<1.000000e+00>
652+
; CHECK-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1.000000e+00>
653+
; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%19>, vp<[[BROADCAST]]>
652654
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
653-
; CHECK-NEXT: vp<%5> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
654-
; CHECK-NEXT: WIDEN store vp<%5>, ir<%conv1>
655+
; CHECK-NEXT: vp<[[VECTOR_END_POINTER:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<%18>
656+
; CHECK-NEXT: WIDEN store vp<[[VECTOR_END_POINTER]]>, ir<%conv1>
655657
; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<%18>.1
656658
; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<%n.vec>
657659
; CHECK-NEXT: Successor(s): middle.block, vector.body

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-inloop-reduction.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1847,7 +1847,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
18471847
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
18481848
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
18491849
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
1850-
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
1850+
; IF-EVL-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
1851+
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
18511852
; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
18521853
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
18531854
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -1967,7 +1968,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
19671968
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0
19681969
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP13]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP10]])
19691970
; IF-EVL-NEXT: [[TMP14:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
1970-
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
1971+
; IF-EVL-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]]
1972+
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]])
19711973
; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP10]] to i64
19721974
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP17]], [[EVL_BASED_IV]]
19731975
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reduction.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,7 +1901,8 @@ define i32 @anyof_icmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
19011901
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
19021902
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
19031903
; IF-EVL-NEXT: [[TMP13:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 3)
1904-
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
1904+
; IF-EVL-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
1905+
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
19051906
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
19061907
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
19071908
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
@@ -2021,7 +2022,8 @@ define i32 @anyof_fcmp(ptr %a, i64 %n, i32 %start, i32 %inv) {
20212022
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
20222023
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x float> @llvm.vp.load.nxv4f32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP9]])
20232024
; IF-EVL-NEXT: [[TMP13:%.*]] = fcmp fast olt <vscale x 4 x float> [[VP_OP_LOAD]], splat (float 3.000000e+00)
2024-
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
2025+
; IF-EVL-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP13]]
2026+
; IF-EVL-NEXT: [[TMP15]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP9]])
20252027
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
20262028
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
20272029
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]

llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@
4242
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
4343
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
4444
; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
45-
; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]>
45+
; IF-EVL-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<0>
46+
; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub vp<[[BROADCAST]]>, ir<[[LD2]]>
4647
; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>)
4748
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]>
4849
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[EVL_PHI]]>

llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
2323
; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
2424
; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
2525
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
26-
; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
27-
; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
26+
; CHECK-NEXT: EMIT vp<[[BROADCAST1:%.+]]> = broadcast ir<-12>
27+
; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST1]]>
28+
; CHECK-NEXT: EMIT vp<[[BROADCAST2:%.+]]> = broadcast ir<13>
29+
; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST2]]>
2830
; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
2931
; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]>
3032
; CHECK-NEXT: Successor(s): pred.store

0 commit comments

Comments
 (0)