Commit 1b25b97

refactor downgrade/CFG removal
Signed-off-by: Dmitry Sidorov <[email protected]>
1 parent 1818b32 commit 1b25b97

File tree: 2 files changed, +100 -133 lines changed

llvm/lib/SYCLLowerIR/SYCLOptimizeBarriers.cpp

Lines changed: 97 additions & 130 deletions
@@ -84,34 +84,24 @@
 // minimal set of barriers required to enforce ordering for the memory
 // operations it actually performs.
 //
-// 4) **CFG-Wide Elimination**
-//    For each pair of barriers A and B in the function:
-//    - If A dominates B and B post dominates A and there are no accesses that only B would need to
-//      order, B can be removed.
-//    FIXME: The logic shoud actually be:
-//    a) *Dominator-Based Removal*
-//       For each pair (A, B) with identical Exec and Mem scopes where A
-//       dominates B:
-//       – If *every* path from A to B has no accesses >= A.MemScope, remove
-//         B.
-//       - If every path from A to B has no accesses >= A.MemScope, remove B.
-//    b) *Post-Dominator-Based Removal*
-//       For each pair (A, B) with identical scopes where B post-dominates A:
-//       – If *every* path from A to B has no accesses >= A.MemScope, remove
-//         A.
-//       - If every path from A to B has no accesses >= A.MemScope, remove A.
+// 3) **CFG-Wide Optimization (Dominator/Post-Dominator)**
+//    Perform barrier analysis across the entire CFG using dominance
+//    and post-dominance to remove or narrow memory scope and semantic of
+//    barrier calls:
 //
-//    But there are loops to handle, so simpler logic is used for now.
+//    a) *Dominator-Based Elimination* — For any two barriers A and B where
+//       A's ExecScope and MemScope cover B's (i.e., A subsumes B in both
+//       execution and memory ordering semantics) and A's fence semantics
+//       include B's, if A dominates B and B post-dominates A, and there are no
+//       memory accesses at or above the fenced scope on any path between A and
+//       B, then B is fully redundant and can be removed.
 //
-// 5) **Global -> Local Downgrade**
-//    For each global-scope barrier B (MemScope == Device/CrossDevice or
-//    CrossWorkgroupMemory semantics):
-//    – If there exists another global barrier A that dominates or
-//      post-dominates B and no Global/Unknown accesses occur between the two,
-//      B can be downgraded to Workgroup scope.
-//    - If there exists another global barrier A that dominates or
-//      post-dominates B and no Global or Unknown accesses occur between the
-//      two, B can be downgraded to Workgroup scope.
+//    b) *Global-to-Local Downgrade* — For barriers that fence global memory
+//       (Device/CrossDevice or CrossWorkgroupMemory semantics), if another
+//       global barrier A dominates or post-dominates barrier B with no
+//       intervening global or unknown accesses, B's MemScope is lowered to
+//       Workgroup. Their fence semantics are merged so that no ordering
+//       guarantees are weakened.
 //
 //===----------------------------------------------------------------------===//
 
@@ -555,7 +545,7 @@ static bool noFencedAccessesCFG(CallInst *A, CallInst *B,
                                 RegionMemScope Required,
                                 BBMemInfoMap &BBMemInfo) {
   LLVM_DEBUG(dbgs() << "Checking for fenced accesses between: " << *A << " and "
-                    << *B << " in CFG" << "\n");
+                    << *B << " in CFG" << "\n");
   if (Required == RegionMemScope::Unknown)
     return false;
   // Build the set of blocks that can reach B.
@@ -778,131 +768,109 @@ static bool eliminateBackToBackInBB(BasicBlock *BB,
   return Changed;
 }
 
-// Remove barriers that are redundant in the CFG based on dominance relations.
-static bool eliminateDominatedBarriers(SmallVectorImpl<BarrierDesc *> &Barriers,
-                                       DominatorTree &DT,
-                                       PostDominatorTree &PDT,
-                                       BBMemInfoMap &BBMemInfo) {
+// Walk the whole CFG once, first trying to erase fully–redundant
+// barriers and, if that is impossible, trying to downgrade
+// Cross-work-group barriers that are safely covered by another global fence.
+static bool optimizeBarriersCFG(SmallVectorImpl<BarrierDesc *> &Barriers,
+                                DominatorTree &DT, PostDominatorTree &PDT,
+                                BBMemInfoMap &BBMemInfo) {
   bool Changed = false;
-  for (auto *B1 : Barriers) {
-    if (!B1->CI)
-      continue;
-    for (auto *B2 : Barriers) {
-      // Check if the barrier was already removed.
-      if (B1 == B2 || !B2->CI)
-        continue;
 
-      // Skip if scopes are unknown or B1 does not enforce at least the
-      // semantics of B2.
-      if (B1->ExecScope == Scope::Unknown || B1->MemScope == Scope::Unknown ||
-          B2->ExecScope == Scope::Unknown || B2->MemScope == Scope::Unknown)
-        continue;
-      auto ExecCmp = compareScopesWithWeights(B1->ExecScope, B2->ExecScope);
-      auto MemCmp = compareScopesWithWeights(B1->MemScope, B2->MemScope);
-      if (ExecCmp == CompareRes::UNKNOWN || MemCmp == CompareRes::UNKNOWN)
-        continue;
-      bool ExecSubsumes =
-          ExecCmp == CompareRes::BIGGER || ExecCmp == CompareRes::EQUAL;
-      bool MemSubsumes =
-          MemCmp == CompareRes::BIGGER || MemCmp == CompareRes::EQUAL;
-      bool SemSubsumes = (B1->Semantic & B2->Semantic) == B2->Semantic;
+  for (BarrierDesc *B : Barriers) {
+    if (!B->CI)
+      continue; // Already removed
 
-      if (!ExecSubsumes || !MemSubsumes || !SemSubsumes)
-        continue;
+    bool Removed = false;
+    bool IsGlobalB =
+        (B->MemScope == Scope::Device || B->MemScope == Scope::CrossDevice ||
+         (B->Semantic &
+          static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory)));
+    BarrierDesc *DowngradeCand = nullptr;
 
-      RegionMemScope Fence = getBarrierFencedScope(*B1);
-      if (Fence == RegionMemScope::Unknown)
+    for (BarrierDesc *A : Barriers) {
+      if (A == B || !A->CI)
         continue;
 
-      // FIXME: missing optimization, see the header comment. For now live
-      // with the simpler logic.
-      if (DT.dominates(B1->CI, B2->CI) && PDT.dominates(B2->CI, B1->CI))
-        if (noFencedAccessesCFG(B1->CI, B2->CI, Fence, BBMemInfo))
-          Changed |= eraseBarrierWithITT(*B2);
-    }
-  }
-  return Changed;
-}
-
-// Downgrade global barriers to workgroup when no global memory is touched
-// before the next global barrier.
-static bool downgradeGlobalBarriers(SmallVectorImpl<BarrierDesc *> &Barriers,
-                                    DominatorTree &DT, PostDominatorTree &PDT,
-                                    BBMemInfoMap &BBMemInfo) {
-  bool Changed = false;
+      // Elimination check.
+      auto ExecCmp = compareScopesWithWeights(A->ExecScope, B->ExecScope);
+      auto MemCmp = compareScopesWithWeights(A->MemScope, B->MemScope);
+      bool ScopesCover =
+          (ExecCmp == CompareRes::BIGGER || ExecCmp == CompareRes::EQUAL) &&
+          (MemCmp == CompareRes::BIGGER || MemCmp == CompareRes::EQUAL);
+      bool SemCover = (A->Semantic & B->Semantic) == B->Semantic;
+      bool ADominatesB = DT.dominates(A->CI, B->CI);
+      if (ScopesCover && SemCover) {
+        RegionMemScope Fence = getBarrierFencedScope(*A);
+        // FIXME: this check is way too conservative.
+        if (Fence != RegionMemScope::Unknown && ADominatesB &&
+            PDT.dominates(B->CI, A->CI) &&
+            noFencedAccessesCFG(A->CI, B->CI, Fence, BBMemInfo)) {
+          Changed |= eraseBarrierWithITT(*B);
+          Removed = true;
+          break;
+        }
+      }
 
-  // Identify a global barrier: either SPIR-V Device/CrossDevice scope
-  // or has the CrossWorkgroupMemory bit.
-  auto IsGlobalBarrier = [](const BarrierDesc &BD) {
-    return BD.MemScope == Scope::Device || BD.MemScope == Scope::CrossDevice ||
-           (BD.Semantic &
-            static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory));
-  };
+      // Downgrade check.
+      if (!Removed && IsGlobalB && !DowngradeCand) {
+        bool IsGlobalA =
+            (A->MemScope == Scope::Device ||
+             A->MemScope == Scope::CrossDevice ||
+             (A->Semantic &
+              static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory)));
+        if (IsGlobalA) {
+          if (DT.dominates(A->CI, B->CI) &&
+              noFencedAccessesCFG(A->CI, B->CI, RegionMemScope::Global,
+                                  BBMemInfo)) {
+            DowngradeCand = A;
+          } else if (PDT.dominates(A->CI, B->CI) &&
+                     noFencedAccessesCFG(B->CI, A->CI, RegionMemScope::Global,
+                                         BBMemInfo)) {
+            DowngradeCand = A;
+          }
+        }
+      }
+    }
 
-  for (auto *BPtr : Barriers) {
-    BarrierDesc &B = *BPtr;
-    if (!B.CI || !IsGlobalBarrier(B))
-      continue;
-    if (B.ExecScope == Scope::Unknown || B.MemScope == Scope::Unknown)
+    if (Removed)
       continue;
 
-    // Look for an earlier barrier A that completely subsumes B:
-    // A must dominate or post-dominates B, with no intervening global
-    // accesses. A must itself be a global barrier.
-    for (auto *APtr : Barriers) {
-      if (APtr == BPtr)
-        continue;
-      BarrierDesc &A = *APtr;
-      if (!A.CI)
-        continue;
-
-      bool CanDowngrade = false;
-      // A strictly dominates B.
-      if (DT.dominates(A.CI, B.CI) &&
-          noFencedAccessesCFG(A.CI, B.CI, RegionMemScope::Global, BBMemInfo)) {
-        CanDowngrade = true;
-      }
-      // or A post-dominates B block.
-      else if (PDT.dominates(A.CI, B.CI) &&
-               noFencedAccessesCFG(B.CI, A.CI, RegionMemScope::Global,
-                                   BBMemInfo)) {
-        CanDowngrade = true;
-      }
-      if (!CanDowngrade)
-        continue;
-
-      // Merge ordering semantics so we never weaken A joint B fence.
-      uint32_t MergedSem = mergeSemantics(A.Semantic, B.Semantic);
-      LLVMContext &Ctx = B.CI->getContext();
+    if (DowngradeCand) {
+      BarrierDesc &A = *DowngradeCand;
+      BarrierDesc &R = *B;
+      uint32_t mergedSem = mergeSemantics(A.Semantic, R.Semantic);
+      LLVMContext &Ctx = R.CI->getContext();
       const bool IsControlBarrier =
-          B.CI->getCalledFunction()->getName() == CONTROL_BARRIER;
+          R.CI->getCalledFunction()->getName() == CONTROL_BARRIER;
       Type *Int32Ty = Type::getInt32Ty(Ctx);
-      if (MergedSem != B.Semantic) {
-        B.CI->setArgOperand(IsControlBarrier ? 2 : 1,
-                            ConstantInt::get(Int32Ty, MergedSem));
-        B.Semantic = MergedSem;
+
+      // Merge ordering semantics.
+      if (mergedSem != R.Semantic) {
+        R.CI->setArgOperand(IsControlBarrier ? 2 : 1,
+                            ConstantInt::get(Int32Ty, mergedSem));
+        R.Semantic = mergedSem;
       }
 
-      // Downgrade memory semantics: CrossWorkgroup -> Workgroup.
+      // Downgrade CrossWorkgroup -> Workgroup semantics.
       const uint32_t CrossMask =
           static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory);
-      if (B.Semantic & CrossMask) {
+      if (R.Semantic & CrossMask) {
         uint32_t NewSem =
-            (B.Semantic & ~CrossMask) |
+            (R.Semantic & ~CrossMask) |
             static_cast<uint32_t>(MemorySemantics::WorkgroupMemory);
-        B.CI->setArgOperand(IsControlBarrier ? 2 : 1,
+        R.CI->setArgOperand(IsControlBarrier ? 2 : 1,
                             ConstantInt::get(Int32Ty, NewSem));
-        B.Semantic = NewSem;
+        R.Semantic = NewSem;
       }
-      LLVM_DEBUG(dbgs() << "Downgrade global barrier: " << *B.CI << "\n");
-      // Lower the SPIR-V memory-scope operand to Workgroup.
-      B.CI->setArgOperand(
+
+      // Lower the SPIR-V MemScope operand to Workgroup.
+      R.CI->setArgOperand(
           IsControlBarrier ? 1 : 0,
           ConstantInt::get(Int32Ty, static_cast<uint32_t>(Scope::Workgroup)));
-      B.MemScope = Scope::Workgroup;
+      R.MemScope = Scope::Workgroup;
 
+      LLVM_DEBUG(dbgs() << "Downgraded global barrier: " << *R.CI << "\n");
       Changed = true;
-      break;
     }
   }
 
@@ -1005,8 +973,7 @@ PreservedAnalyses SYCLOptimizeBarriersPass::run(Function &F,
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
 
-  Changed |= eliminateDominatedBarriers(BarrierPtrs, DT, PDT, BBMemInfo);
-  Changed |= downgradeGlobalBarriers(BarrierPtrs, DT, PDT, BBMemInfo);
+  Changed |= optimizeBarriersCFG(BarrierPtrs, DT, PDT, BBMemInfo);
 
   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
 }
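
A note on the covering test in the new optimizeBarriersCFG: barrier B is only considered for removal when another barrier A covers B's execution scope, memory scope, and fence-semantics bits, and even then only if A dominates B, B post-dominates A, and no fenced accesses lie on the paths between them. The standalone sketch below models just the covering part outside of LLVM; the Scope enum ordering and the scopeRank and covers helpers are illustrative assumptions, not the pass's code, which relies on compareScopesWithWeights and the Semantic bit mask.

// Standalone illustration (not the pass itself): barrier A may stand in for
// barrier B only if A's execution scope, memory scope, and semantics bits
// cover B's, so that erasing B cannot weaken any ordering guarantee.
#include <cstdint>
#include <iostream>

enum class Scope { Invocation, Subgroup, Workgroup, Device, CrossDevice };

// Hypothetical weight: a larger rank means a wider scope. The real SPIR-V
// numeric encodings differ; the pass uses its own comparison table.
static int scopeRank(Scope S) { return static_cast<int>(S); }

struct Barrier {
  Scope ExecScope;
  Scope MemScope;
  uint32_t Semantic; // SPIR-V MemorySemantics bit mask.
};

// A covers B when A is at least as wide in both scopes and A's semantics
// bits are a superset of B's.
static bool covers(const Barrier &A, const Barrier &B) {
  return scopeRank(A.ExecScope) >= scopeRank(B.ExecScope) &&
         scopeRank(A.MemScope) >= scopeRank(B.MemScope) &&
         (A.Semantic & B.Semantic) == B.Semantic;
}

int main() {
  // 0x10 = SequentiallyConsistent, 0x100 = WorkgroupMemory,
  // 0x200 = CrossWorkgroupMemory (standard SPIR-V encodings).
  Barrier A{Scope::Workgroup, Scope::Device, 0x10 | 0x100 | 0x200};
  Barrier B{Scope::Workgroup, Scope::Workgroup, 0x10 | 0x100};
  std::cout << "A covers B: " << covers(A, B) << "\n"; // prints 1
  std::cout << "B covers A: " << covers(B, A) << "\n"; // prints 0
  return 0;
}

In the pass itself this test corresponds to the ScopesCover and SemCover flags computed in the inner loop before the dominance and fenced-access checks.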

llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-back-to-back-barrier.ll

Lines changed: 3 additions & 3 deletions
@@ -21,13 +21,13 @@ define spir_kernel void @_Z3fooii(i32 %0, i32 %1) {
 ; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 64, i32 noundef 2, i32 noundef 400)
 ; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper()
 ; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper()
-; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP0]], i32 noundef 2, i32 noundef 912)
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP0]], i32 noundef 2, i32 noundef 400)
 ; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper()
 ; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper()
-; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP0]], i32 noundef 2, i32 noundef 912)
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP0]], i32 noundef 2, i32 noundef 400)
 ; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper()
 ; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper()
-; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP1]], i32 noundef 2, i32 noundef 912)
+; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP1]], i32 noundef 2, i32 noundef 400)
 ; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper()
 ; CHECK-NEXT: ret void
 ;
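
The updated CHECK lines appear to reflect the downgrade path: the semantics operand 912 (0x390, which carries the CrossWorkgroupMemory bit 0x200) becomes 400 (0x190, WorkgroupMemory only), while the scope operand stays at 2 (Workgroup). A minimal arithmetic sketch of that bit rewrite, assuming the standard SPIR-V MemorySemantics encodings (SequentiallyConsistent = 0x10, SubgroupMemory = 0x80, WorkgroupMemory = 0x100, CrossWorkgroupMemory = 0x200) and mirroring the (R.Semantic & ~CrossMask) | WorkgroupMemory expression from the pass:

// Minimal arithmetic sketch (not pass code): clear the CrossWorkgroupMemory
// bit and set WorkgroupMemory, turning the 912 operand above into 400.
#include <cstdint>
#include <iostream>

int main() {
  constexpr uint32_t CrossWorkgroupMemory = 0x200; // standard SPIR-V bit
  constexpr uint32_t WorkgroupMemory = 0x100;      // standard SPIR-V bit

  uint32_t Sem = 912; // 0x390 = SequentiallyConsistent | SubgroupMemory |
                      //         WorkgroupMemory | CrossWorkgroupMemory
  uint32_t NewSem = (Sem & ~CrossWorkgroupMemory) | WorkgroupMemory;
  std::cout << NewSem << "\n"; // prints 400 (0x190)
  return 0;
}

If this reading is correct, the barriers in the test keep their workgroup-level ordering but no longer fence cross-workgroup memory.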
