-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[VPlan] Introduces explicit broadcast for live-in constants. #133213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[VPlan] Introduces explicit broadcast for live-in constants. #133213
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-risc-v Author: Elvis Wang (ElvisWang123) ChangesThis patch focus on represent the broadcast for the live-in constants explicitly. This can help the VPlan-based cost model the broadcast cost and track the register pressure of the broadcast value in the future. This patch will not change the generated vector IR and it only changes the output of VPlan. Note that Full diff: https://github.com/llvm/llvm-project/pull/133213.diff 6 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a7c85d30ba9f0..026f5b349987d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1127,6 +1127,17 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ switch (Opcode) {
+ default:
+ return false;
+ case Instruction::ExtractValue:
+ return Op == getOperand(1);
+ }
+ }
};
/// VPWidenCastRecipe is a recipe to create vector cast instructions.
@@ -1365,6 +1376,14 @@ class VPWidenCallRecipe : public VPRecipeWithIRFlags {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ // Scalar called fuction cannot be vectorized.
+ return Op == getOperand(getNumOperands() - 1);
+ }
};
/// A recipe representing a sequence of load -> update -> store as part of
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index cdef7972f3bdc..f992496735f0e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -915,6 +915,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
default:
return false;
case Instruction::ExtractElement:
+ case Instruction::ExtractValue:
+ case VPInstruction::ExtractFromEnd:
return Op == getOperand(1);
case Instruction::PHI:
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 67f77b41e878a..9df744a0629c6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2207,10 +2207,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
auto *VectorPreheader = Plan.getVectorPreheader();
for (VPValue *VPV : VPValues) {
- if (all_of(VPV->users(),
- [VPV](VPUser *U) { return U->usesScalars(VPV); }) ||
- (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
- isa<Constant>(VPV->getLiveInIRValue())))
+ if (all_of(VPV->users(), [VPV](VPUser *U) { return U->usesScalars(VPV); }))
continue;
// Add explicit broadcast at the insert point that dominates all users.
@@ -2227,8 +2224,25 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
"All users must be in the vector preheader or dominated by it");
}
- VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
- auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
+ VPInstruction *Broadcast;
+ if (VPV->isLiveIn() && isa_and_nonnull<Constant>(VPV->getLiveInIRValue())) {
+ // We cannot replace the constant live-ins for PHIs by broadcast in the
+ // same VPBB because it will break PHI. Also cannot replace the
+ // VPWidenGEPRecipe since it broadcasts the generated pointer instead of
+ // operands.
+ if (auto *R = dyn_cast_if_present<VPRecipeBase>(*(VPV->users().begin()));
+ R && !isa<VPHeaderPHIRecipe, VPWidenPHIRecipe, VPWidenGEPRecipe>(R) &&
+ !VPV->hasMoreThanOneUniqueUser()) {
+ Broadcast = new VPInstruction(VPInstruction::Broadcast, {VPV});
+ // Insert just before the user to reduce register pressure.
+ Broadcast->insertBefore(R);
+ } else {
+ continue;
+ }
+ } else {
+ VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
+ Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
+ }
VPV->replaceUsesWithIf(Broadcast,
[VPV, Broadcast](VPUser &U, unsigned Idx) {
return Broadcast != &U && !U.usesScalars(VPV);
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 95df397ecdf41..8d79d18bc7f57 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -201,7 +201,8 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
+; CHECK-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1>
+; CHECK-NEXT: WIDEN ir<%add9> = add ir<[[L]]>, vp<[[BROADCAST]]>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
; CHECK-NEXT: vp<[[VEC_PTR2:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
@@ -450,7 +451,8 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
; CHECK-NEXT: WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
+; CHECK-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<1.000000e+00>
+; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<[[L]]>, vp<[[BROADCAST]]>
; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-end-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
index dfc2fffdad2bb..cddf2f0a00cfa 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll
@@ -44,7 +44,8 @@
; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]>
; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]>
; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]>
- ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub ir<0>, ir<[[LD2]]>
+ ; IF-EVL-NEXT: EMIT vp<[[BROADCAST:%.+]]> = broadcast ir<0>
+ ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = sub vp<[[BROADCAST]]>, ir<[[LD2]]>
; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<[[LD2]]>, ir<[[SUB]]>, vp<[[EVL]]>)
; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = add vp<[[SELECT]]>, ir<[[LD1]]>
; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]>
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
index 8374ac88c8bad..299ba9c8a0e95 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll
@@ -23,8 +23,10 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) {
; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]>
; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]>
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]>
-; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12>
-; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13>
+; CHECK-NEXT: EMIT vp<[[BROADCAST1:%.+]]> = broadcast ir<-12>
+; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST1]]>
+; CHECK-NEXT: EMIT vp<[[BROADCAST2:%.+]]> = broadcast ir<13>
+; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, vp<[[BROADCAST2]]>
; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]>
; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]>
; CHECK-NEXT: Successor(s): pred.store
|
This patch focus on explicit show the broadcast for the live-in constants. This can help the VPlan-based cost model the broadcast cost and track the register pressure of the broadcast value in the future. Live-in constants usually only has single user so insert the `broadcast` before the user to reduce the live range of the broadcast value and prevent generated vector IR changes.
5d523fb
to
f15d348
Compare
Gentle ping :) |
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
// Scalar called fuction cannot be vectorized. | ||
return Op == getOperand(getNumOperands() - 1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If I understand correctly, the last operand is the called scalar function, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it is the function pointer of the scalar function.
@@ -1339,6 +1339,17 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { | |||
void print(raw_ostream &O, const Twine &Indent, | |||
VPSlotTracker &SlotTracker) const override; | |||
#endif | |||
|
|||
bool onlyFirstLaneUsed(const VPValue *Op) const override { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if it's worth pulling the VPlan.h and VPlanRecipes.cpp changes out into a separate PR as they make sense on their own? That way we can see what test changes are due to which code changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Split off to #145449, thanks!
auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV}); | ||
VPInstruction *Broadcast; | ||
if (VPV->isLiveIn() && isa_and_nonnull<Constant>(VPV->getLiveInIRValue())) { | ||
// We cannot replace the constant live-ins for PHIs by broadcast in the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I understand the relevance of PHIs in this code, since nothing else in this function seems to use or create them?
!VPV->hasMoreThanOneUniqueUser()) { | ||
Broadcast = new VPInstruction(VPInstruction::Broadcast, {VPV}); | ||
// Insert just before the user to reduce register pressure. | ||
Broadcast->insertBefore(R); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Isn't there a problem here? It looks like you're assuming that the first user of VPV dominates all users of VPV, but I don't think that is always true. The HoistPoint
above was chosen as a location to dominate all users so why not just use that instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For register pressure, if hoisting all of constant broadcast to the preheader, the register pressure will be high in the loop body.
The register allocated in the preheader will mark as used across entire vector body (perhaps not now, but more accurate in future), which is not that accurate. If the broadcast instruction only in loop body, the register usage can be free when it reach the usage.
Probably need to limit all the user of the constant need to in the same BB.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you share a bit more on the follow-up motivation?
I'm slightly worried that this pessimizes results in some cases, as some constants can be freely broadcasted/materialized.
; IF-EVL-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i1> [[VEC_PHI]], [[TMP14]] | ||
; IF-EVL-NEXT: [[TMP16]] = call <vscale x 4 x i1> @llvm.vp.merge.nxv4i1(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> [[VEC_PHI]], i32 [[TMP10]]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Regression here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
switch (Opcode) { | ||
default: | ||
return false; | ||
case Instruction::ExtractValue: | ||
return Op == getOperand(1); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Simpler to check for the single opcode we are intersted in with an if
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, update in #145449. Thanks!
b7ec47b
to
b31efdd
Compare
This patch focus on represent the broadcast for the live-in constants explicitly. This can help the VPlan-based cost model the broadcast cost and track the register pressure of the broadcast value in the future.
This patch will not change the generated vector IR and it only changes the output of VPlan.
Note that
materializeBroadcast()
pass will execute after cost model and register pressure model at this moment. So only affect the output ofFinal
plan.