|
84 | 84 | // minimal set of barriers required to enforce ordering for the memory
|
85 | 85 | // operations it actually performs.
|
86 | 86 | //
|
87 |
| -// 4) **CFG-Wide Elimination** |
88 |
| -// For each pair of barriers A and B in the function: |
89 |
| -// - If A dominates B and B post dominates A and there are no accesses that only B would need to |
90 |
| -// order, B can be removed. |
91 |
| -// FIXME: The logic should actually be: |
92 |
| -// a) *Dominator-Based Removal* |
93 |
| -// For each pair (A, B) with identical Exec and Mem scopes where A |
94 |
| -// dominates B: |
95 |
| -// – If *every* path from A to B has no accesses >= A.MemScope, remove |
96 |
| -// B. |
97 |
| -// - If every path from A to B has no accesses >= A.MemScope, remove B. |
98 |
| -// b) *Post-Dominator-Based Removal* |
99 |
| -// For each pair (A, B) with identical scopes where B post-dominates A: |
100 |
| -// – If *every* path from A to B has no accesses >= A.MemScope, remove |
101 |
| -// A. |
102 |
| -// - If every path from A to B has no accesses >= A.MemScope, remove A. |
| 87 | +// 3) **CFG-Wide Optimization (Dominator/Post-Dominator)** |
| 88 | +// Perform barrier analysis across the entire CFG using dominance |
| 89 | +// and post-dominance to remove or narrow memory scope and semantics of |
| 90 | +// barrier calls: |
103 | 91 | //
|
104 |
| -// But there are loops to handle, so simpler logic is used for now. |
| 92 | +// a) *Dominator-Based Elimination* — For any two barriers A and B where |
| 93 | +// A's ExecScope and MemScope cover B's (i.e., A subsumes B in both |
| 94 | +// execution and memory ordering semantics) and A's fence semantics |
| 95 | +// include B's, if A dominates B and B post-dominates A, and there are no |
| 96 | +// memory accesses at or above the fenced scope on any path between A and |
| 97 | +// B, then B is fully redundant and can be removed. |
105 | 98 | //
|
106 |
| -// 5) **Global -> Local Downgrade** |
107 |
| -// For each global-scope barrier B (MemScope == Device/CrossDevice or |
108 |
| -// CrossWorkgroupMemory semantics): |
109 |
| -// – If there exists another global barrier A that dominates or |
110 |
| -// post-dominates B and no Global/Unknown accesses occur between the two, |
111 |
| -// B can be downgraded to Workgroup scope. |
112 |
| -// - If there exists another global barrier A that dominates or |
113 |
| -// post-dominates B and no Global or Unknown accesses occur between the |
114 |
| -// two, B can be downgraded to Workgroup scope. |
| 99 | +// b) *Global-to-Local Downgrade* — For barriers that fence global memory |
| 100 | +// (Device/CrossDevice or CrossWorkgroupMemory semantics), if another |
| 101 | +// global barrier A dominates or post-dominates barrier B with no |
| 102 | +// intervening global or unknown accesses, B's MemScope is lowered to |
| 103 | +// Workgroup. Their fence semantics are merged so that no ordering |
| 104 | +// guarantees are weakened. |
115 | 105 | //
|
116 | 106 | //===----------------------------------------------------------------------===//
|
117 | 107 |
|
@@ -555,7 +545,7 @@ static bool noFencedAccessesCFG(CallInst *A, CallInst *B,
|
555 | 545 | RegionMemScope Required,
|
556 | 546 | BBMemInfoMap &BBMemInfo) {
|
557 | 547 | LLVM_DEBUG(dbgs() << "Checking for fenced accesses between: " << *A << " and "
|
558 |
| - << *B << " in CFG" << "\n"); |
| 548 | + << *B << " in CFG" << "\n"); |
559 | 549 | if (Required == RegionMemScope::Unknown)
|
560 | 550 | return false;
|
561 | 551 | // Build the set of blocks that can reach B.
|
@@ -778,131 +768,109 @@ static bool eliminateBackToBackInBB(BasicBlock *BB,
|
778 | 768 | return Changed;
|
779 | 769 | }
|
780 | 770 |
|
781 |
| -// Remove barriers that are redundant in the CFG based on dominance relations. |
782 |
| -static bool eliminateDominatedBarriers(SmallVectorImpl<BarrierDesc *> &Barriers, |
783 |
| - DominatorTree &DT, |
784 |
| - PostDominatorTree &PDT, |
785 |
| - BBMemInfoMap &BBMemInfo) { |
| 771 | +// Walk the whole CFG once, first trying to erase fully-redundant |
| 772 | +// barriers and, if that is impossible, trying to downgrade |
| 773 | +// Cross-work-group barriers that are safely covered by another global fence. |
| 774 | +static bool optimizeBarriersCFG(SmallVectorImpl<BarrierDesc *> &Barriers, |
| 775 | + DominatorTree &DT, PostDominatorTree &PDT, |
| 776 | + BBMemInfoMap &BBMemInfo) { |
786 | 777 | bool Changed = false;
|
787 |
| - for (auto *B1 : Barriers) { |
788 |
| - if (!B1->CI) |
789 |
| - continue; |
790 |
| - for (auto *B2 : Barriers) { |
791 |
| - // Check if the barrier was already removed. |
792 |
| - if (B1 == B2 || !B2->CI) |
793 |
| - continue; |
794 | 778 |
|
795 |
| - // Skip if scopes are unknown or B1 does not enforce at least the |
796 |
| - // semantics of B2. |
797 |
| - if (B1->ExecScope == Scope::Unknown || B1->MemScope == Scope::Unknown || |
798 |
| - B2->ExecScope == Scope::Unknown || B2->MemScope == Scope::Unknown) |
799 |
| - continue; |
800 |
| - auto ExecCmp = compareScopesWithWeights(B1->ExecScope, B2->ExecScope); |
801 |
| - auto MemCmp = compareScopesWithWeights(B1->MemScope, B2->MemScope); |
802 |
| - if (ExecCmp == CompareRes::UNKNOWN || MemCmp == CompareRes::UNKNOWN) |
803 |
| - continue; |
804 |
| - bool ExecSubsumes = |
805 |
| - ExecCmp == CompareRes::BIGGER || ExecCmp == CompareRes::EQUAL; |
806 |
| - bool MemSubsumes = |
807 |
| - MemCmp == CompareRes::BIGGER || MemCmp == CompareRes::EQUAL; |
808 |
| - bool SemSubsumes = (B1->Semantic & B2->Semantic) == B2->Semantic; |
| 779 | + for (BarrierDesc *B : Barriers) { |
| 780 | + if (!B->CI) |
| 781 | + continue; // Already removed |
809 | 782 |
|
810 |
| - if (!ExecSubsumes || !MemSubsumes || !SemSubsumes) |
811 |
| - continue; |
| 783 | + bool Removed = false; |
| 784 | + bool IsGlobalB = |
| 785 | + (B->MemScope == Scope::Device || B->MemScope == Scope::CrossDevice || |
| 786 | + (B->Semantic & |
| 787 | + static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory))); |
| 788 | + BarrierDesc *DowngradeCand = nullptr; |
812 | 789 |
|
813 |
| - RegionMemScope Fence = getBarrierFencedScope(*B1); |
814 |
| - if (Fence == RegionMemScope::Unknown) |
| 790 | + for (BarrierDesc *A : Barriers) { |
| 791 | + if (A == B || !A->CI) |
815 | 792 | continue;
|
816 | 793 |
|
817 |
| - // FIXME: missing optimization, see the header comment. For now live |
818 |
| - // with the simpler logic. |
819 |
| - if (DT.dominates(B1->CI, B2->CI) && PDT.dominates(B2->CI, B1->CI)) |
820 |
| - if (noFencedAccessesCFG(B1->CI, B2->CI, Fence, BBMemInfo)) |
821 |
| - Changed |= eraseBarrierWithITT(*B2); |
822 |
| - } |
823 |
| - } |
824 |
| - return Changed; |
825 |
| -} |
826 |
| - |
827 |
| -// Downgrade global barriers to workgroup when no global memory is touched |
828 |
| -// before the next global barrier. |
829 |
| -static bool downgradeGlobalBarriers(SmallVectorImpl<BarrierDesc *> &Barriers, |
830 |
| - DominatorTree &DT, PostDominatorTree &PDT, |
831 |
| - BBMemInfoMap &BBMemInfo) { |
832 |
| - bool Changed = false; |
| 794 | + // Elimination check. |
| 795 | + auto ExecCmp = compareScopesWithWeights(A->ExecScope, B->ExecScope); |
| 796 | + auto MemCmp = compareScopesWithWeights(A->MemScope, B->MemScope); |
| 797 | + bool ScopesCover = |
| 798 | + (ExecCmp == CompareRes::BIGGER || ExecCmp == CompareRes::EQUAL) && |
| 799 | + (MemCmp == CompareRes::BIGGER || MemCmp == CompareRes::EQUAL); |
| 800 | + bool SemCover = (A->Semantic & B->Semantic) == B->Semantic; |
| 801 | + bool ADominatesB = DT.dominates(A->CI, B->CI); |
| 802 | + if (ScopesCover && SemCover) { |
| 803 | + RegionMemScope Fence = getBarrierFencedScope(*A); |
| 804 | + // FIXME: this check is way too conservative. |
| 805 | + if (Fence != RegionMemScope::Unknown && ADominatesB && |
| 806 | + PDT.dominates(B->CI, A->CI) && |
| 807 | + noFencedAccessesCFG(A->CI, B->CI, Fence, BBMemInfo)) { |
| 808 | + Changed |= eraseBarrierWithITT(*B); |
| 809 | + Removed = true; |
| 810 | + break; |
| 811 | + } |
| 812 | + } |
833 | 813 |
|
834 |
| - // Identify a global barrier: either SPIR-V Device/CrossDevice scope |
835 |
| - // or has the CrossWorkgroupMemory bit. |
836 |
| - auto IsGlobalBarrier = [](const BarrierDesc &BD) { |
837 |
| - return BD.MemScope == Scope::Device || BD.MemScope == Scope::CrossDevice || |
838 |
| - (BD.Semantic & |
839 |
| - static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory)); |
840 |
| - }; |
| 814 | + // Downgrade check. |
| 815 | + if (!Removed && IsGlobalB && !DowngradeCand) { |
| 816 | + bool IsGlobalA = |
| 817 | + (A->MemScope == Scope::Device || |
| 818 | + A->MemScope == Scope::CrossDevice || |
| 819 | + (A->Semantic & |
| 820 | + static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory))); |
| 821 | + if (IsGlobalA) { |
| 822 | + if (DT.dominates(A->CI, B->CI) && |
| 823 | + noFencedAccessesCFG(A->CI, B->CI, RegionMemScope::Global, |
| 824 | + BBMemInfo)) { |
| 825 | + DowngradeCand = A; |
| 826 | + } else if (PDT.dominates(A->CI, B->CI) && |
| 827 | + noFencedAccessesCFG(B->CI, A->CI, RegionMemScope::Global, |
| 828 | + BBMemInfo)) { |
| 829 | + DowngradeCand = A; |
| 830 | + } |
| 831 | + } |
| 832 | + } |
| 833 | + } |
841 | 834 |
|
842 |
| - for (auto *BPtr : Barriers) { |
843 |
| - BarrierDesc &B = *BPtr; |
844 |
| - if (!B.CI || !IsGlobalBarrier(B)) |
845 |
| - continue; |
846 |
| - if (B.ExecScope == Scope::Unknown || B.MemScope == Scope::Unknown) |
| 835 | + if (Removed) |
847 | 836 | continue;
|
848 | 837 |
|
849 |
| - // Look for an earlier barrier A that completely subsumes B: |
850 |
| -// A must dominate or post-dominate B, with no intervening global |
851 |
| - // accesses. A must itself be a global barrier. |
852 |
| - for (auto *APtr : Barriers) { |
853 |
| - if (APtr == BPtr) |
854 |
| - continue; |
855 |
| - BarrierDesc &A = *APtr; |
856 |
| - if (!A.CI) |
857 |
| - continue; |
858 |
| - |
859 |
| - bool CanDowngrade = false; |
860 |
| - // A strictly dominates B. |
861 |
| - if (DT.dominates(A.CI, B.CI) && |
862 |
| - noFencedAccessesCFG(A.CI, B.CI, RegionMemScope::Global, BBMemInfo)) { |
863 |
| - CanDowngrade = true; |
864 |
| - } |
865 |
| - // or A post-dominates B block. |
866 |
| - else if (PDT.dominates(A.CI, B.CI) && |
867 |
| - noFencedAccessesCFG(B.CI, A.CI, RegionMemScope::Global, |
868 |
| - BBMemInfo)) { |
869 |
| - CanDowngrade = true; |
870 |
| - } |
871 |
| - if (!CanDowngrade) |
872 |
| - continue; |
873 |
| - |
874 |
| - // Merge ordering semantics so we never weaken A joint B fence. |
875 |
| - uint32_t MergedSem = mergeSemantics(A.Semantic, B.Semantic); |
876 |
| - LLVMContext &Ctx = B.CI->getContext(); |
| 838 | + if (DowngradeCand) { |
| 839 | + BarrierDesc &A = *DowngradeCand; |
| 840 | + BarrierDesc &R = *B; |
| 841 | + uint32_t mergedSem = mergeSemantics(A.Semantic, R.Semantic); |
| 842 | + LLVMContext &Ctx = R.CI->getContext(); |
877 | 843 | const bool IsControlBarrier =
|
878 |
| - B.CI->getCalledFunction()->getName() == CONTROL_BARRIER; |
| 844 | + R.CI->getCalledFunction()->getName() == CONTROL_BARRIER; |
879 | 845 | Type *Int32Ty = Type::getInt32Ty(Ctx);
|
880 |
| - if (MergedSem != B.Semantic) { |
881 |
| - B.CI->setArgOperand(IsControlBarrier ? 2 : 1, |
882 |
| - ConstantInt::get(Int32Ty, MergedSem)); |
883 |
| - B.Semantic = MergedSem; |
| 846 | + |
| 847 | + // Merge ordering semantics. |
| 848 | + if (mergedSem != R.Semantic) { |
| 849 | + R.CI->setArgOperand(IsControlBarrier ? 2 : 1, |
| 850 | + ConstantInt::get(Int32Ty, mergedSem)); |
| 851 | + R.Semantic = mergedSem; |
884 | 852 | }
|
885 | 853 |
|
886 |
| - // Downgrade memory semantics: CrossWorkgroup -> Workgroup. |
| 854 | + // Downgrade CrossWorkgroup -> Workgroup semantics. |
887 | 855 | const uint32_t CrossMask =
|
888 | 856 | static_cast<uint32_t>(MemorySemantics::CrossWorkgroupMemory);
|
889 |
| - if (B.Semantic & CrossMask) { |
| 857 | + if (R.Semantic & CrossMask) { |
890 | 858 | uint32_t NewSem =
|
891 |
| - (B.Semantic & ~CrossMask) | |
| 859 | + (R.Semantic & ~CrossMask) | |
892 | 860 | static_cast<uint32_t>(MemorySemantics::WorkgroupMemory);
|
893 |
| - B.CI->setArgOperand(IsControlBarrier ? 2 : 1, |
| 861 | + R.CI->setArgOperand(IsControlBarrier ? 2 : 1, |
894 | 862 | ConstantInt::get(Int32Ty, NewSem));
|
895 |
| - B.Semantic = NewSem; |
| 863 | + R.Semantic = NewSem; |
896 | 864 | }
|
897 |
| - LLVM_DEBUG(dbgs() << "Downgrade global barrier: " << *B.CI << "\n"); |
898 |
| - // Lower the SPIR-V memory-scope operand to Workgroup. |
899 |
| - B.CI->setArgOperand( |
| 865 | + |
| 866 | + // Lower the SPIR-V MemScope operand to Workgroup. |
| 867 | + R.CI->setArgOperand( |
900 | 868 | IsControlBarrier ? 1 : 0,
|
901 | 869 | ConstantInt::get(Int32Ty, static_cast<uint32_t>(Scope::Workgroup)));
|
902 |
| - B.MemScope = Scope::Workgroup; |
| 870 | + R.MemScope = Scope::Workgroup; |
903 | 871 |
|
| 872 | + LLVM_DEBUG(dbgs() << "Downgraded global barrier: " << *R.CI << "\n"); |
904 | 873 | Changed = true;
|
905 |
| - break; |
906 | 874 | }
|
907 | 875 | }
|
908 | 876 |
|
@@ -1005,8 +973,7 @@ PreservedAnalyses SYCLOptimizeBarriersPass::run(Function &F,
|
1005 | 973 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
|
1006 | 974 | PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
|
1007 | 975 |
|
1008 |
| - Changed |= eliminateDominatedBarriers(BarrierPtrs, DT, PDT, BBMemInfo); |
1009 |
| - Changed |= downgradeGlobalBarriers(BarrierPtrs, DT, PDT, BBMemInfo); |
| 976 | + Changed |= optimizeBarriersCFG(BarrierPtrs, DT, PDT, BBMemInfo); |
1010 | 977 |
|
1011 | 978 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
|
1012 | 979 | }
|
0 commit comments