diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index e4474a0156027..8928fbbcd009a 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -52,6 +52,7 @@ #include "llvm/SYCLLowerIR/SYCLAddOptLevelAttribute.h" #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h" #include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h" +#include "llvm/SYCLLowerIR/SYCLOptimizeBarriers.h" #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h" #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h" #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h" @@ -1096,6 +1097,16 @@ void EmitAssemblyHelper::RunOptimizationPipeline( }); } + // Add SYCLOptimizeBarriers pass for SYCL device code. + if (LangOpts.SYCLIsDevice) { + PB.registerOptimizerLastEPCallback([](ModulePassManager &MPM, + OptimizationLevel Level, + ThinOrFullLTOPhase) { + MPM.addPass( + createModuleToFunctionPassAdaptor(SYCLOptimizeBarriersPass())); + }); + } + const bool PrepareForThinLTO = CodeGenOpts.PrepareForThinLTO; const bool PrepareForLTO = CodeGenOpts.PrepareForLTO; diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h b/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBarriers.h similarity index 52% rename from llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h rename to llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBarriers.h index 7ea93f928d4c2..0adb1e2fe3612 100644 --- a/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h +++ b/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBarriers.h @@ -1,4 +1,4 @@ -//==- SYCLOptimizeBackToBackBarrier.h - SYCLOptimizeBackToBackBarrier Pass -==// +//==- SYCLOptimizeBarriers.h - SYCLOptimizeBarriers Pass -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,24 +6,24 @@ // //===----------------------------------------------------------------------===// // -// This pass cleans up back-to-back ControlBarrier calls. +// This pass cleans up ControlBarrier and MemoryBarrier calls. 
// //===----------------------------------------------------------------------===// -#ifndef LLVM_SYCL_OPTIMIZE_BACK_TO_BACK_BARRIER_H -#define LLVM_SYCL_OPTIMIZE_BACK_TO_BACK_BARRIER_H +#ifndef LLVM_SYCL_OPTIMIZE_BARRIERS_H +#define LLVM_SYCL_OPTIMIZE_BARRIERS_H #include "llvm/IR/PassManager.h" namespace llvm { -class SYCLOptimizeBackToBackBarrierPass - : public PassInfoMixin { +class SYCLOptimizeBarriersPass + : public PassInfoMixin { public: - PreservedAnalyses run(Module &M, ModuleAnalysisManager &); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &); static bool isRequired() { return true; } }; } // namespace llvm -#endif // LLVM_SYCL_OPTIMIZE_BACK_TO_BACK_BARRIER_H +#endif // LLVM_SYCL_OPTIMIZE_BARRIERS_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 227afecb5daca..ef48663a4bdb6 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -199,7 +199,7 @@ #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h" #include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h" #include "llvm/SYCLLowerIR/SYCLJointMatrixTransform.h" -#include "llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h" +#include "llvm/SYCLLowerIR/SYCLOptimizeBarriers.h" #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h" #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h" #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 2cf4bd1e3a0bd..a84287073acca 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -179,7 +179,6 @@ MODULE_PASS("esimd-remove-host-code", ESIMDRemoveHostCodePass()); MODULE_PASS("esimd-remove-optnone-noinline", ESIMDRemoveOptnoneNoinlinePass()); MODULE_PASS("sycl-conditional-call-on-device", SYCLConditionalCallOnDevicePass()) MODULE_PASS("sycl-joint-matrix-transform", SYCLJointMatrixTransformPass()) -MODULE_PASS("sycl-optimize-back-to-back-barrier", SYCLOptimizeBackToBackBarrierPass()) MODULE_PASS("sycl-propagate-aspects-usage", SYCLPropagateAspectsUsagePass()) MODULE_PASS("sycl-propagate-joint-matrix-usage", SYCLPropagateJointMatrixUsagePass()) MODULE_PASS("sycl-add-opt-level-attribute", SYCLAddOptLevelAttributePass()) @@ -507,6 +506,7 @@ FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass()) FUNCTION_PASS("slsr", StraightLineStrengthReducePass()) FUNCTION_PASS("stack-protector", StackProtectorPass(TM)) FUNCTION_PASS("strip-gc-relocates", StripGCRelocates()) +FUNCTION_PASS("sycl-optimize-barriers", SYCLOptimizeBarriersPass()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass()) diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 4576066584acf..d643840cb274f 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -74,7 +74,7 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLDeviceRequirements.cpp SYCLKernelParamOptInfo.cpp SYCLJointMatrixTransform.cpp - SYCLOptimizeBackToBackBarrier.cpp + SYCLOptimizeBarriers.cpp SYCLPropagateAspectsUsage.cpp SYCLPropagateJointMatrixUsage.cpp SYCLVirtualFunctionsAnalysis.cpp diff --git a/llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp b/llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp deleted file mode 100644 index e7973dd48212f..0000000000000 --- a/llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp +++ /dev/null @@ -1,160 +0,0 @@ 
-//=== SYCLOptimizeBackToBackBarrier.cpp - SYCL barrier optimization pass ===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass cleans up back-to-back ControlBarrier calls. -// -//===----------------------------------------------------------------------===// - -#include "llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h" - -#include "llvm/IR/IRBuilder.h" - -using namespace llvm; - -namespace { - -static constexpr char CONTROL_BARRIER[] = "_Z22__spirv_ControlBarrieriii"; -static constexpr char ITT_BARRIER[] = "__itt_offload_wg_barrier_wrapper"; -static constexpr char ITT_RESUME[] = "__itt_offload_wi_resume_wrapper"; - -// Known scopes in SPIR-V. -enum class Scope { - CrossDevice = 0, - Device = 1, - Workgroup = 2, - Subgroup = 3, - Invocation = 4 -}; - -enum class CompareRes { BIGGER = 0, SMALLER = 1, EQUAL = 2, UNKNOWN = 3 }; - -// This map is added in case of any future scopes are added to SPIR-V and/or -// SYCL. -const std::unordered_map ScopeWeights = { - {static_cast(Scope::CrossDevice), 1000}, - {static_cast(Scope::Device), 800}, - {static_cast(Scope::Workgroup), 600}, - {static_cast(Scope::Subgroup), 400}, - {static_cast(Scope::Invocation), 10}}; - -inline CompareRes compareScopesWithWeights(const uint64_t LHS, - const uint64_t RHS) { - auto LHSIt = ScopeWeights.find(LHS); - auto RHSIt = ScopeWeights.find(RHS); - - if (LHSIt == ScopeWeights.end() || RHSIt == ScopeWeights.end()) - return CompareRes::UNKNOWN; - - const uint64_t LHSWeight = LHSIt->second; - const uint64_t RHSWeight = RHSIt->second; - - if (LHSWeight > RHSWeight) - return CompareRes::BIGGER; - if (LHSWeight < RHSWeight) - return CompareRes::SMALLER; - return CompareRes::EQUAL; -} - -// The function removes back-to-back ControlBarrier calls in case if they -// have the same memory scope and memory semantics arguments. When two -// back-to-back ControlBarriers are having different execution scope arguments - -// pick the one with the 'bigger' scope. -// It also cleans up ITT annotations surrounding the removed barrier call. -bool processControlBarrier(Function *F) { - BasicBlock *PrevBB = nullptr; - llvm::SmallPtrSet ToErase; - for (auto I = F->user_begin(), E = F->user_end(); I != E;) { - User *U = *I++; - auto *CI = dyn_cast(U); - if (!CI) - continue; - - // New basic block - new processing. - BasicBlock *CurrentBB = CI->getParent(); - if (CurrentBB != PrevBB) { - PrevBB = CurrentBB; - continue; - } - - llvm::SmallPtrSet ToEraseLocalITT; - BasicBlock::iterator It(CI); - // Iterate over the basic block storing back-to-back barriers and their ITT - // annotations into ToErase container. 
- while (It != CurrentBB->begin()) { - --It; - auto *Cand = dyn_cast(&*It); - if (!Cand) - break; - CallInst *CIToRemove = Cand; - StringRef CandName = Cand->getCalledFunction()->getName(); - if (CandName == ITT_RESUME || CandName == ITT_BARRIER) { - ToEraseLocalITT.insert(Cand); - continue; - } else if (CandName == CONTROL_BARRIER) { - bool EqualOps = true; - const auto *ExecutionScopeCI = CI->getOperand(0); - const auto *ExecutionScopeCand = Cand->getOperand(0); - if (ExecutionScopeCI != ExecutionScopeCand) { - if (isa(ExecutionScopeCI) && - isa(ExecutionScopeCand)) { - const auto ConstScopeCI = - cast(ExecutionScopeCI)->getZExtValue(); - const auto ConstScopeCand = - cast(ExecutionScopeCand)->getZExtValue(); - // Pick ControlBarrier with the 'bigger' execution scope. - const auto Compare = - compareScopesWithWeights(ConstScopeCI, ConstScopeCand); - if (Compare == CompareRes::SMALLER) - CIToRemove = CI; - else if (Compare == CompareRes::UNKNOWN) - // Unknown scopes = unknown rules. Keep ControlBarrier call. - EqualOps = false; - } else - EqualOps = false; - } - // TODO: may be handle a case with not-matching memory scope and - // memory semantic arguments in a smart way. - for (unsigned I = 1; I != CI->getNumOperands(); ++I) { - if (CI->getOperand(I) != Cand->getOperand(I)) { - EqualOps = false; - break; - } - } - if (EqualOps) { - ToErase.insert(CIToRemove); - for (auto *ITT : ToEraseLocalITT) - ToErase.insert(ITT); - ToEraseLocalITT.clear(); - } - } - } - } - - if (ToErase.empty()) - return false; - - for (auto *I : ToErase) { - I->dropAllReferences(); - I->eraseFromParent(); - } - - return true; -} - -} // namespace - -PreservedAnalyses -SYCLOptimizeBackToBackBarrierPass::run(Module &M, ModuleAnalysisManager &MAM) { - bool ModuleChanged = false; - for (Function &F : M) - if (F.isDeclaration()) - if (F.getName() == CONTROL_BARRIER) - ModuleChanged |= processControlBarrier(&F); - - return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} diff --git a/llvm/lib/SYCLLowerIR/SYCLOptimizeBarriers.cpp b/llvm/lib/SYCLLowerIR/SYCLOptimizeBarriers.cpp new file mode 100644 index 0000000000000..5490687b136fc --- /dev/null +++ b/llvm/lib/SYCLLowerIR/SYCLOptimizeBarriers.cpp @@ -0,0 +1,1013 @@ +//==== SYCLOptimizeBarriers.cpp - SYCL barrier optimization pass ====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass optimizes __spirv_ControlBarrier and __spirv_MemoryBarrier calls. +// +// SYCL Barrier-Optimization Pass Overview +// +// 1) **Collect Phase** +// * Walk through the function and record every barrier call into a list of +// BarrierDesc: +// – CI : the call instruction +// – ExecScope : the execution-scope operand +// – MemScope : the memory-scope operand +// – Semantic : the fence-semantics bits +// * At the same time, build a per-BB summary of memory accesses: +// – None : only private/constant or no accesses +// – Local : at least one addrspace(3) access +// – Global : at least one addrspace(1/5/6) access (with an exception of +// loads from __spirv_BuiltIn GVs) +// – Unknown : any other mayReadOrWriteMemory() (intrinsics, calls, +// generic addrspace) +// * Walk the function and record every barrier call into a list of +// BarrierDesc structures. 
+//
+// 2) **At Entry and At Exit Elimination**
+//    - **Entry**: For each barrier B, if on *every* path from function entry
+//      to B there are no accesses to a memory region greater than or equal to
+//      B.MemScope, then remove B.
+//    - **Exit** : For each barrier B, if on *every* path from B to any
+//      function return there are no accesses to a memory region greater than
+//      or equal to B.MemScope, then remove B.
+//
+// 3) **Back-to-Back Elimination (per-BB)**
+//    a) *Pure-Sync Collapse*
+//       If the BB summary is None (no local, global or unknown accesses) and
+//       no barrier in the block has an Unknown scope:
+//       - Keep only the barrier with the widest (ExecScope, MemScope).
+//       - Erase all other barriers, since they synchronize nothing.
+//    b) *General Redundancy Check*
+//       Otherwise walk the barriers in source order and compare each new
+//       barrier to the most recent one that is still alive:
+//       - If they fence the same execution and memory scope and there are no
+//         accesses that need fencing between them, the later barrier is
+//         redundant and removed.
+//       - If the earlier barrier fences a superset of what the later one
+//         would fence and there are no accesses that only the later barrier
+//         would need to order, the later barrier is removed.
+//       - Symmetrically, if the later barrier fences a superset and the
+//         intervening code contains nothing that only the earlier barrier
+//         needed, the earlier barrier is removed.
+//       Any barrier whose execution or memory scope is Unknown is kept
+//       conservatively. After a single pass every basic block contains only
+//       the minimal set of barriers required to enforce ordering for the
+//       memory operations it actually performs.
+//
+// 4) **CFG-Wide Optimization (Dominator/Post-Dominator)**
+//    Perform barrier analysis across the entire CFG, using dominance and
+//    post-dominance to remove barriers or to narrow their memory scope and
+//    semantics:
+//
+//    a) *Dominator-Based Elimination*: For any two barriers A and B where
+//       A's ExecScope and MemScope cover B's (i.e., A subsumes B in both
+//       execution and memory ordering semantics) and A's fence semantics
+//       include B's, if A dominates B and B post-dominates A, and there are
+//       no memory accesses at or above the fenced scope on any path between
+//       A and B, then B is fully redundant and can be removed.
+//
+//    b) *Global-to-Local Downgrade*: For barriers that fence global memory
+//       (Device/CrossDevice or CrossWorkgroupMemory semantics), if another
+//       global barrier A dominates or post-dominates barrier B with no
+//       intervening global or unknown accesses, B's MemScope is lowered to
+//       Workgroup. Their fence semantics are merged so that no ordering
+//       guarantees are weakened.
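+//
+// For example (this mirrors the bb_remove case in the accompanying tests), a
+// kernel that performs no memory accesses and issues two identical
+// work-group barriers
+//
+//   call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 0)
+//   call void @_Z22__spirv_ControlBarrieriii(i32 2, i32 2, i32 0)
+//   ret void
+//
+// is reduced to a plain 'ret void': neither barrier fences any observable
+// memory operation, so both are removed.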
+// +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/SYCLOptimizeBarriers.h" + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include + +using namespace llvm; + +#define DEBUG_TYPE "sycl-opt-barriers" + +namespace { + +// Hard-coded special names used in the pass. +static constexpr char CONTROL_BARRIER[] = "_Z22__spirv_ControlBarrieriii"; +static constexpr char MEMORY_BARRIER[] = "_Z21__spirv_MemoryBarrierii"; +static constexpr char ITT_BARRIER[] = "__itt_offload_wg_barrier_wrapper"; +static constexpr char ITT_RESUME[] = "__itt_offload_wi_resume_wrapper"; +static constexpr char SPIRV_BUILTIN_PREFIX[] = "__spirv_BuiltIn"; + +// Simple enum to capture whether a block has local/global/unknown accesses. +enum class RegionMemScope { + None = 0, + Local = 1, + Global = 2, + Generic = 3, + Unknown = 4 +}; + +// Known address spaces for SPIR target. +enum class SPIRAddrSpaces { + Private = 0, + Global = 1, + Constant = 2, + Local = 3, + Generic = 4, + GlobalDevice = 5, + GlobalHost = 6 +}; + +// Map SPIR-V address spaces to our little RegionMemScope domain. +static const std::unordered_map AddrSpaceMap = { + {static_cast(SPIRAddrSpaces::Private), RegionMemScope::None}, + {static_cast(SPIRAddrSpaces::Constant), RegionMemScope::None}, + + {static_cast(SPIRAddrSpaces::Global), RegionMemScope::Global}, + {static_cast(SPIRAddrSpaces::GlobalDevice), + RegionMemScope::Global}, + {static_cast(SPIRAddrSpaces::GlobalHost), RegionMemScope::Global}, + + {static_cast(SPIRAddrSpaces::Local), RegionMemScope::Local}, + + {static_cast(SPIRAddrSpaces::Generic), RegionMemScope::Generic}, + // any future AS default to Unknown +}; + +// Scopes in SPIR-V. +enum class Scope { + CrossDevice = 0, + Device = 1, + Workgroup = 2, + Subgroup = 3, + Invocation = 4, + Unknown = 10 +}; + +// This enum, map and compare function are added to compare widths of the +// barrier scopes and make pass forward compatible in case if new scopes +// appearing in SPIR-V and/or SYCL. +enum class CompareRes { BIGGER = 0, SMALLER = 1, EQUAL = 2, UNKNOWN = 3 }; + +const std::unordered_map ScopeWeights = { + {Scope::CrossDevice, 1000}, + {Scope::Device, 800}, + {Scope::Workgroup, 600}, + {Scope::Subgroup, 400}, + {Scope::Invocation, 10}}; + +static inline CompareRes compareScopesWithWeights(Scope LHS, Scope RHS) { + auto LHSIt = ScopeWeights.find(LHS); + auto RHSIt = ScopeWeights.find(RHS); + + if (LHSIt == ScopeWeights.end() || RHSIt == ScopeWeights.end()) + return CompareRes::UNKNOWN; + + const uint64_t LHSWeight = LHSIt->second; + const uint64_t RHSWeight = RHSIt->second; + + if (LHSWeight > RHSWeight) + return CompareRes::BIGGER; + if (LHSWeight < RHSWeight) + return CompareRes::SMALLER; + return CompareRes::EQUAL; +} + +enum class MemorySemantics { + SubgroupMemory = 0x80, + WorkgroupMemory = 0x100, + CrossWorkgroupMemory = 0x200 +}; + +enum class Ordering { + Acquire = 0x2, + Release = 0x4, + AcquireRelease = 0x8, + SequentiallyConsistent = 0x10 +}; + +static constexpr uint32_t MemorySemanticMask = ~0x3fu; + +// Normalize a raw 'memory semantics' bitmask to a canonical form. 
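+// For example, Acquire (0x2) | Release (0x4) is rewritten to AcquireRelease
+// (0x8), and when SequentiallyConsistent (0x10) is present the weaker
+// ordering bits are dropped while the memory-kind bits (Subgroup/Workgroup/
+// CrossWorkgroupMemory) are kept.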
+static inline uint32_t canonicalizeSemantic(uint32_t Sem) { + bool HasAcq = Sem & static_cast(Ordering::Acquire); + bool HasRel = Sem & static_cast(Ordering::Release); + bool HasAcqRel = Sem & static_cast(Ordering::AcquireRelease); + bool HasSeq = Sem & static_cast(Ordering::SequentiallyConsistent); + + if (HasSeq) + Sem &= MemorySemanticMask | + static_cast(Ordering::SequentiallyConsistent); + else { + if (HasAcq && HasRel) + HasAcqRel = true; + if (HasAcqRel) { + Sem &= ~(static_cast(Ordering::Acquire) | + static_cast(Ordering::Release)); + Sem |= static_cast(Ordering::AcquireRelease); + } + } + return Sem; +} + +// Merge two semantics bitmasks into a single canonical form. +static inline uint32_t mergeSemantics(uint32_t A, uint32_t B) { + return canonicalizeSemantic(A | B); +} + +// Return the ordering class of a semantic bitmask. +static inline int orderingClass(uint32_t Sem) { + Sem = canonicalizeSemantic(Sem); + if (Sem & static_cast(Ordering::SequentiallyConsistent)) + return 4; + if (Sem & static_cast(Ordering::AcquireRelease)) + return 3; + if (Sem & static_cast(Ordering::Release)) + return 2; + if (Sem & static_cast(Ordering::Acquire)) + return 1; + return 0; +} + +// Check if A is a superset of B in terms of semantics and ordering. +static inline bool semanticsSuperset(uint32_t A, uint32_t B) { + A = canonicalizeSemantic(A); + B = canonicalizeSemantic(B); + uint32_t AMem = A & MemorySemanticMask; + uint32_t BMem = B & MemorySemanticMask; + if ((BMem & ~AMem) != 0) + return false; + + int AOrd = orderingClass(A); + int BOrd = orderingClass(B); + + if (AOrd == 4) + return true; + if (AOrd == 3) + return BOrd <= 3; + if (AOrd == 2) + return BOrd == 2 || BOrd == 0; + if (AOrd == 1) + return BOrd == 1 || BOrd == 0; + return BOrd == 0; +} + +// Holds everything we know about one barrier invocation. +struct BarrierDesc { + CallInst *CI; + Scope ExecScope; + Scope MemScope; + uint32_t Semantic; +}; + +// Per-BB summary of what kinds of accesses appear. +using BBMemInfoMap = DenseMap; + +// Per-BB summary of Barriers. +using BarriersMap = DenseMap>; + +// Map SPIR-V Barrier Scope to the RegionMemScope that a barrier of that kind +// actually fences. +template +static inline RegionMemScope getBarrierFencedScopeImpl(const BarrierDesc &BD) { + uint32_t Sem = canonicalizeSemantic(BD.Semantic); + constexpr uint32_t LocalMask = + static_cast(MemorySemantics::WorkgroupMemory) | + static_cast(MemorySemantics::SubgroupMemory); + constexpr uint32_t GlobalMask = + static_cast(MemorySemantics::CrossWorkgroupMemory); + + if constexpr (SearchFor == RegionMemScope::Local) { + if (Sem & LocalMask) + return RegionMemScope::Local; + if (Sem & GlobalMask) + return RegionMemScope::Global; + } else { + if (Sem & GlobalMask) + return RegionMemScope::Global; + if (Sem & LocalMask) + return RegionMemScope::Local; + } + + return RegionMemScope::None; +} + +static inline RegionMemScope getBarrierFencedScope(const BarrierDesc &BD) { + return getBarrierFencedScopeImpl(BD); +} +static inline RegionMemScope getBarrierMaxFencedScope(const BarrierDesc &BD) { + return getBarrierFencedScopeImpl(BD); +} + +// Classify a single instruction's memory scope. Used to set/update memory +// scope of a basic block. 
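+// Roughly: addrspace(3) accesses map to Local, addrspace(1/5/6) to Global,
+// private/constant to None, and loads from __spirv_BuiltIn globals to None
+// (barriers never need to order them). Generic pointers are resolved to the
+// underlying allocation where possible (e.g. a generic pointer that traces
+// back to an alloca is treated as private); generic accesses that cannot be
+// resolved are Unknown.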
+static RegionMemScope classifyMemScope(Instruction *I) { + if (CallInst *CI = dyn_cast(I)) { + if (Function *F = CI->getCalledFunction()) { + const StringRef FName = F->getName(); + if (FName == CONTROL_BARRIER || FName == MEMORY_BARRIER || + FName == ITT_BARRIER || FName == ITT_RESUME) + return RegionMemScope::None; + if (FName.contains("__spirv_Atomic")) { + // SPIR-V atomics all have the same signature: + // arg0 = ptr, arg1 = SPIR-V Scope, arg2 = Semantics + auto *ScopeC = dyn_cast(CI->getArgOperand(1)); + auto *SemC = dyn_cast(CI->getArgOperand(2)); + if (!ScopeC || !SemC) + return RegionMemScope::Unknown; + // If the semantics mention CrossWorkgroupMemory, treat as global. + uint32_t SemVal = canonicalizeSemantic(SemC->getZExtValue()); + if (SemVal & (uint32_t)MemorySemantics::CrossWorkgroupMemory) + return RegionMemScope::Global; + if (SemVal & ((uint32_t)MemorySemantics::WorkgroupMemory | + (uint32_t)MemorySemantics::SubgroupMemory)) + return RegionMemScope::Local; + switch (ScopeC->getZExtValue()) { + case static_cast(Scope::CrossDevice): + case static_cast(Scope::Device): + return RegionMemScope::Global; + case static_cast(Scope::Workgroup): + case static_cast(Scope::Subgroup): + return RegionMemScope::Local; + case static_cast(Scope::Invocation): + return RegionMemScope::None; + default: + return RegionMemScope::Unknown; + } + } + // TODO: handle other SPIR-V friendly function calls. + } + } + + // If it doesn't read or write, it doesn't affect the region memory scope. + if (!I->mayReadOrWriteMemory()) + return RegionMemScope::None; + + auto resolveGeneric = [&](Value *Pointer) -> RegionMemScope { + // If generic pointer originates from an alloca instruction within a + // function - it's safe to assume, that it's a private allocation. + // FIXME: use more comprehensive analysis. + Value *Orig = Pointer->stripInBoundsConstantOffsets(); + if (isa(Orig)) + return RegionMemScope::None; + uint32_t AS = cast(Orig->getType())->getAddressSpace(); + auto Pos = AddrSpaceMap.find(AS); + if (Pos == AddrSpaceMap.end()) + return RegionMemScope::Unknown; + return Pos->second == RegionMemScope::Generic ? RegionMemScope::Unknown + : Pos->second; + }; + + auto getScopeForPtr = [&](Value *Ptr, uint32_t AS) -> RegionMemScope { + // Loads from __spirv_BuiltIn GVs are not fenced by barriers. + if (auto *GV = dyn_cast(Ptr)) + if (GV->getName().starts_with(SPIRV_BUILTIN_PREFIX)) + return RegionMemScope::None; + auto Pos = AddrSpaceMap.find(AS); + if (Pos == AddrSpaceMap.end()) + return RegionMemScope::Unknown; + return Pos->second == RegionMemScope::Generic ? resolveGeneric(Ptr) + : Pos->second; + }; + + // Check for memory instructions. 
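+  // Handled below: loads, stores, memset/memcpy/memmove intrinsics, fences
+  // (conservatively treated as Global), atomicrmw and cmpxchg.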
+ // TODO: check for other intrinsics + if (auto *LD = dyn_cast(I)) + return getScopeForPtr(LD->getPointerOperand(), + LD->getPointerAddressSpace()); + if (auto *ST = dyn_cast(I)) + return getScopeForPtr(ST->getPointerOperand(), + ST->getPointerAddressSpace()); + if (auto *MI = dyn_cast(I)) { + RegionMemScope Scope = + getScopeForPtr(MI->getDest(), MI->getDestAddressSpace()); + + if (auto *MT = dyn_cast(MI)) { + RegionMemScope SrcScope = + getScopeForPtr(MT->getSource(), MT->getSourceAddressSpace()); + Scope = std::max(Scope, SrcScope); + } + return Scope; + } + if (isa(I)) + return RegionMemScope::Global; + + if (auto *RMW = dyn_cast(I)) + return getScopeForPtr(RMW->getPointerOperand(), + RMW->getPointerAddressSpace()); + if (auto *CompEx = dyn_cast(I)) + return getScopeForPtr(CompEx->getPointerOperand(), + CompEx->getPointerAddressSpace()); + + return RegionMemScope::Unknown; +} + +// Scan the function and build: +// - list of all BarrierDesc‘s +// - per-BB memory-scope summary +static void collectBarriersAndMemInfo(Function &F, + SmallVectorImpl &Barriers, + BBMemInfoMap &BBMemInfo) { + for (auto &BB : F) { + RegionMemScope BlockScope = RegionMemScope::None; + + for (auto &I : BB) { + // Update memory info. + RegionMemScope InstScope = classifyMemScope(&I); + BlockScope = std::max(BlockScope, InstScope); + + // Collect barriers. + if (auto *CI = dyn_cast(&I)) { + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + BlockScope = RegionMemScope::Unknown; + continue; + } + + // Check if this is a control/memory barrier call and store it. + StringRef Name = Callee->getName(); + auto getConst = [&](uint32_t idx) -> uint32_t { + if (auto *C = dyn_cast(CI->getArgOperand(idx))) + return C->getZExtValue(); + return static_cast(Scope::Unknown); + }; + if (Name == CONTROL_BARRIER) { + LLVM_DEBUG(dbgs() << "Collected ControlBarrier: " << *CI << "\n"); + BarrierDesc BD = {CI, static_cast(getConst(0)), + static_cast(getConst(1)), getConst(2)}; + BD.Semantic = canonicalizeSemantic(BD.Semantic); + Barriers.emplace_back(BD); + } else if (Name == MEMORY_BARRIER) { + LLVM_DEBUG(dbgs() << "Collected MemoryBarrier: " << *CI << "\n"); + BarrierDesc BD = {CI, Scope::Invocation, + static_cast(getConst(0)), getConst(1)}; + BD.Semantic = canonicalizeSemantic(BD.Semantic); + Barriers.emplace_back(BD); + } + } + } + BBMemInfo[&BB] = BlockScope; + } +} + +// Check if an instruction is an ITT wrapper call. +static bool isITT(Instruction *Inst) { + if (CallInst *CI = dyn_cast(Inst)) { + if (Function *Callee = CI->getCalledFunction()) { + StringRef Name = Callee->getName(); + if (Name == ITT_RESUME || Name == ITT_BARRIER) + return true; + } + } + return false; +} + +// Remove a single barrier CallInst and drop its surrounding ITT calls. +static bool eraseBarrierWithITT(BarrierDesc &BD) { + if (BD.CI == nullptr) + return false; + SmallPtrSet ToErase; + CallInst *CI = BD.CI; + LLVM_DEBUG(dbgs() << "Erase barrier: " << *CI << "\n"); + // Look up/down for ITT markers. + if (auto *Prev = CI->getPrevNode()) + if (isITT(Prev)) + ToErase.insert(Prev); + if (auto *Next = CI->getNextNode()) + if (isITT(Next)) + ToErase.insert(Next); + ToErase.insert(CI); + BD.CI = nullptr; + + for (auto *I : ToErase) { + I->dropAllReferences(); + I->eraseFromParent(); + } + return !ToErase.empty(); +} + +// True if no fenced accesses of MemScope appear in [A->next, B). 
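+// A and B are expected to be calls in the same basic block; any access at or
+// above Required (or an access of Unknown scope) between them makes the
+// result false.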
+static bool noFencedMemAccessesBetween(CallInst *A, CallInst *B, + RegionMemScope Required, + const BBMemInfoMap &BBMemInfo) { + LLVM_DEBUG(dbgs() << "Checking for fenced accesses between: " << *A << " and " + << *B << "\n"); + RegionMemScope BBMemScope = BBMemInfo.lookup(A->getParent()); + if (BBMemScope == RegionMemScope::Unknown || + Required == RegionMemScope::Unknown) { + LLVM_DEBUG(dbgs() << "noFencedMemAccessesBetween(" << *A << ", " << *B + << ") returned " << false << "\n"); + return false; + } + + // Early exit in case if the whole block has no accesses wider or equal to + // required. + if (BBMemScope < Required) { + LLVM_DEBUG(dbgs() << "noFencedMemAccessesBetween(" << *A << ", " << *B + << ") returned " << true << "\n"); + return true; + } + + if (BBMemScope == RegionMemScope::None) { + LLVM_DEBUG(dbgs() << "noFencedMemAccessesBetween(" << *A << ", " << *B + << ") returned " << true << "\n"); + return true; + } + for (auto It = ++BasicBlock::iterator(A), End = BasicBlock::iterator(B); + It != End; ++It) { + auto InstScope = classifyMemScope(&*It); + if (InstScope == RegionMemScope::Unknown || InstScope >= Required) { + LLVM_DEBUG(dbgs() << "noFencedMemAccessesBetween(" << *A << ", " << *B + << ") returned " << false << "\n"); + return false; + } + } + LLVM_DEBUG(dbgs() << "noFencedMemAccessesBetween(" << *A << ", " << *B + << ") returned " << true << "\n"); + return true; +} + +// Helper to check if a whole block (or a slice) contains accesses fenced by +// 'Required'. +static bool hasFencedAccesses(BasicBlock *BB, RegionMemScope Required, + const BBMemInfoMap &BBMemInfo, + Instruction *Start = nullptr, + Instruction *End = nullptr) { + LLVM_DEBUG(dbgs() << "Checking for fenced accesses in basic block\n"); + // Shortcut: whole BB without barrier scan - return based on BBMemInfo's info. + if (!Start && !End) { + RegionMemScope BlockScope = BBMemInfo.lookup(BB); + return BlockScope == RegionMemScope::Unknown || BlockScope >= Required; + } + auto It = Start ? std::next(BasicBlock::iterator(Start)) : BB->begin(); + auto Finish = End ? BasicBlock::iterator(End) : BB->end(); + for (; It != Finish; ++It) { + RegionMemScope S = classifyMemScope(&*It); + if (S == RegionMemScope::Unknown || S >= Required) + return true; + } + return false; +} + +/// Return true if no accesses of >= Required scope occur on *every* path +/// from A to B through the CFG. If A==nullptr, start at EntryBlock; if +/// B==nullptr, end at all exit blocks. +static bool noFencedAccessesCFG(CallInst *A, CallInst *B, + RegionMemScope Required, + const BBMemInfoMap &BBMemInfo) { + LLVM_DEBUG(dbgs() << "Checking for fenced accesses between: " << *A << " and " + << *B << " in CFG" << "\n"); + if (Required == RegionMemScope::Unknown) + return false; + // Build the set of blocks that can reach B. + SmallPtrSet ReachB; + if (B) { + SmallVector Stack{B->getParent()}; + ReachB.insert(B->getParent()); + while (!Stack.empty()) { + BasicBlock *Cur = Stack.pop_back_val(); + for (BasicBlock *Pred : predecessors(Cur)) + if (ReachB.insert(Pred).second) + Stack.push_back(Pred); + } + } + + // Shortcut: same block and both non-null. + if (A && B && A->getParent() == B->getParent()) + return noFencedMemAccessesBetween(A, B, Required, BBMemInfo); + + Function *F = (A ? A->getFunction() : B->getFunction()); + BasicBlock *Entry = &F->getEntryBlock(); + + // Worklist entries: (BasicBlock, Instruction* startPoint). + SmallVector, 8> Worklist; + SmallPtrSet Visited; + + // Initialize the worklist from CI or ... 
+ if (A) { + Worklist.emplace_back(A->getParent(), A); + Visited.insert(A->getParent()); + } else { + // ... from kernel's entry. + Worklist.emplace_back(Entry, /*start at beginning*/ nullptr); + Visited.insert(Entry); + } + + // Simple BFS-like traversal of the CFG to find all paths from A to B. + while (!Worklist.empty()) { + auto [BB, StartInst] = Worklist.pop_back_val(); + // Check if BB is reachable from B. + if (B && !ReachB.contains(BB)) + continue; + + // If we've reached the block containing B, only scan up to B. + if (B && BB == B->getParent()) { + if (hasFencedAccesses(BB, Required, BBMemInfo, StartInst, B)) + return false; + // Do not descend past B block. + continue; + } + + // If we're scanning to exit and this is a terminator + // block, check from StartInst to the end of BB and then continue to no + // successors. + if (!B && BB->getTerminator()->getNumSuccessors() == 0) { + if (hasFencedAccesses(BB, Required, BBMemInfo, StartInst, nullptr)) { + LLVM_DEBUG(dbgs() << "noFencedAccessesCFG(" << *A << ", " << *B + << ") returned " << false << "\n"); + return false; + } + // Do not enqueue successors (there are none). + continue; + } + + // Otherwise, scan entire block. + if (hasFencedAccesses(BB, Required, BBMemInfo, StartInst, nullptr)) { + LLVM_DEBUG(dbgs() << "noFencedAccessesCFG(" << *A << ", " << *B + << ") returned " << false << "\n"); + return false; + } + + // Enqueue successors. + for (BasicBlock *Succ : successors(BB)) + if ((!B || ReachB.contains(Succ)) && Visited.insert(Succ).second) + Worklist.emplace_back(Succ, /*no partial start*/ nullptr); + } + + // If we never saw a disallowed memory access on any path, it's safe. + LLVM_DEBUG(dbgs() << "noFencedAccessesCFG(" << *A << ", " << *B + << ") returned " << true << "\n"); + return true; +} + +// The back-to-back elimination on one BB. +static bool eliminateBackToBackInBB(BasicBlock *BB, + SmallVectorImpl &Barriers, + const BBMemInfoMap &BBMemInfo) { + SmallVector Survivors; + bool Changed = false; + RegionMemScope BlockScope = + BB ? BBMemInfo.lookup(BB) : RegionMemScope::Unknown; + + // If there are no memory accesses requiring synchronization in this block, + // collapse all barriers to the single largest one. + if (BlockScope == RegionMemScope::None) { + bool HasUnknown = llvm::any_of(Barriers, [](const BarrierDesc &BD) { + return BD.ExecScope == Scope::Unknown || BD.MemScope == Scope::Unknown; + }); + if (!HasUnknown) { + LLVM_DEBUG( + dbgs() << "Erasing barrier in basic block with no memory accesses\n"); + // Pick the barrier with the widest scope. + auto Best = std::max_element( + Barriers.begin(), Barriers.end(), [](auto &A, auto &B) { + // First prefer the barrier whose semantics fence more memory + + // stronger ordering. + if (semanticsSuperset(B.Semantic, A.Semantic) && + !semanticsSuperset(A.Semantic, B.Semantic)) + return true; + if (semanticsSuperset(A.Semantic, B.Semantic) && + !semanticsSuperset(B.Semantic, A.Semantic)) + return false; + // Then fall back to exec/mem‐scope width as before: + auto CmpExec = compareScopesWithWeights(B.ExecScope, A.ExecScope); + if (CmpExec != CompareRes::EQUAL) + return CmpExec == CompareRes::BIGGER; + auto CmpMem = compareScopesWithWeights(B.MemScope, A.MemScope); + return CmpMem == CompareRes::BIGGER; + }); + + // Remove all other barriers in the block. 
+ llvm::erase_if(Barriers, [&](BarrierDesc &BD) { + if (&BD == &*Best) + return false; + Changed |= eraseBarrierWithITT(BD); + return true; + }); + return Changed; + } + } + + // Otherwise do a sliding window compare of each barrier against the + // last survivor. + for (auto &Cur : Barriers) { + if (!Cur.CI) + continue; // already removed + while (!Survivors.empty()) { + BarrierDesc &Last = Survivors.back(); + uint32_t LastSem = canonicalizeSemantic(Last.Semantic); + uint32_t CurSem = canonicalizeSemantic(Cur.Semantic); + uint32_t MergedSem = mergeSemantics(LastSem, CurSem); + + auto CmpExec = compareScopesWithWeights(Last.ExecScope, Cur.ExecScope); + auto CmpMem = compareScopesWithWeights(Last.MemScope, Cur.MemScope); + RegionMemScope FenceLast = getBarrierFencedScope(Last); + RegionMemScope FenceCur = getBarrierFencedScope(Cur); + + // If either scope is unknown, we cannot merge. + if (CmpExec == CompareRes::UNKNOWN || CmpMem == CompareRes::UNKNOWN || + FenceLast == RegionMemScope::Unknown || + FenceCur == RegionMemScope::Unknown) + break; + + auto *Int32Ty = Type::getInt32Ty(Last.CI->getContext()); + // If the execution and memory scopes of the barriers are equal, we can + // merge them if there are no accesses that only one of the barriers + // would need to fence. + RegionMemScope BetweenScope = std::min(FenceLast, FenceCur); + if (CmpExec == CompareRes::EQUAL && CmpMem == CompareRes::EQUAL) { + if (semanticsSuperset(LastSem, CurSem) && + noFencedMemAccessesBetween(Last.CI, Cur.CI, BetweenScope, + BBMemInfo)) { + if (MergedSem != LastSem) { + Last.CI->setArgOperand(2, ConstantInt::get(Int32Ty, MergedSem)); + Last.Semantic = MergedSem; + } + Changed |= eraseBarrierWithITT(Cur); + break; + } + if (semanticsSuperset(CurSem, LastSem) && + noFencedMemAccessesBetween(Last.CI, Cur.CI, BetweenScope, + BBMemInfo)) { + if (MergedSem != CurSem) { + Cur.CI->setArgOperand(2, ConstantInt::get(Int32Ty, MergedSem)); + Cur.Semantic = MergedSem; + } + Changed |= eraseBarrierWithITT(Last); + Survivors.pop_back(); + continue; + } + if (noFencedMemAccessesBetween(Last.CI, Cur.CI, BetweenScope, + BBMemInfo)) { + Last.CI->setArgOperand(2, ConstantInt::get(Int32Ty, MergedSem)); + Last.Semantic = MergedSem; + Changed |= eraseBarrierWithITT(Cur); + } + break; + } + // If the execution or memory scope of the barriers is not equal, we + // can only merge if one is a superset of the other and there are no + // accesses that only the other barrier would need to fence. + if ((CmpExec == CompareRes::BIGGER || CmpMem == CompareRes::BIGGER) && + semanticsSuperset(LastSem, CurSem) && + noFencedMemAccessesBetween(Last.CI, Cur.CI, BetweenScope, + BBMemInfo)) { + if (MergedSem != LastSem) { + Last.CI->setArgOperand(2, ConstantInt::get(Int32Ty, MergedSem)); + Last.Semantic = MergedSem; + } + Changed |= eraseBarrierWithITT(Cur); + break; + } + if ((CmpExec == CompareRes::SMALLER || CmpMem == CompareRes::SMALLER) && + semanticsSuperset(CurSem, LastSem) && + noFencedMemAccessesBetween(Last.CI, Cur.CI, BetweenScope, + BBMemInfo)) { + if (MergedSem != CurSem) { + Cur.CI->setArgOperand(2, ConstantInt::get(Int32Ty, MergedSem)); + Cur.Semantic = MergedSem; + } + Changed |= eraseBarrierWithITT(Last); + Survivors.pop_back(); + continue; + } + break; + } + if (Cur.CI) // Still alive? + Survivors.emplace_back(Cur); + } + + // If we removed any, replace Barriers with the survivors. 
+ if (Survivors.size() != Barriers.size()) { + Barriers.clear(); + Barriers.append(Survivors.begin(), Survivors.end()); + Changed = true; + } + return Changed; +} + +// Walk the whole CFG once, first trying to erase fully–redundant +// barriers and, if that is impossible, trying to downgrade +// Cross-work-group barriers that are safely covered by another global fence. +static bool optimizeBarriersCFG(SmallVectorImpl &Barriers, + DominatorTree &DT, PostDominatorTree &PDT, + const BBMemInfoMap &BBMemInfo) { + bool Changed = false; + + for (BarrierDesc *B : Barriers) { + if (!B->CI) + continue; // Already removed + + bool Removed = false; + bool IsGlobalB = + (B->MemScope == Scope::Device || B->MemScope == Scope::CrossDevice || + (B->Semantic & + static_cast(MemorySemantics::CrossWorkgroupMemory))); + BarrierDesc *DowngradeCand = nullptr; + + for (BarrierDesc *A : Barriers) { + if (A == B || !A->CI) + continue; + + // Elimination check. + auto ExecCmp = compareScopesWithWeights(A->ExecScope, B->ExecScope); + auto MemCmp = compareScopesWithWeights(A->MemScope, B->MemScope); + bool ScopesCover = + (ExecCmp == CompareRes::BIGGER || ExecCmp == CompareRes::EQUAL) && + (MemCmp == CompareRes::BIGGER || MemCmp == CompareRes::EQUAL); + bool SemCover = (A->Semantic & B->Semantic) == B->Semantic; + bool ADominatesB = DT.dominates(A->CI, B->CI); + if (ScopesCover && SemCover) { + RegionMemScope Fence = getBarrierMaxFencedScope(*A); + // FIXME: this check is way too conservative. + if (Fence != RegionMemScope::Unknown && ADominatesB && + PDT.dominates(B->CI, A->CI) && + noFencedAccessesCFG(A->CI, B->CI, Fence, BBMemInfo)) { + Changed |= eraseBarrierWithITT(*B); + Removed = true; + break; + } + } + + // Downgrade check. + if (!Removed && IsGlobalB && !DowngradeCand) { + bool IsGlobalA = + (A->MemScope == Scope::Device || + A->MemScope == Scope::CrossDevice || + (A->Semantic & + static_cast(MemorySemantics::CrossWorkgroupMemory))); + if (IsGlobalA) { + if (DT.dominates(A->CI, B->CI) && + noFencedAccessesCFG(A->CI, B->CI, RegionMemScope::Global, + BBMemInfo)) { + DowngradeCand = A; + } else if (PDT.dominates(A->CI, B->CI) && + noFencedAccessesCFG(B->CI, A->CI, RegionMemScope::Global, + BBMemInfo)) { + DowngradeCand = A; + } + } + } + } + + if (Removed) + continue; + + if (DowngradeCand) { + BarrierDesc &A = *DowngradeCand; + BarrierDesc &R = *B; + uint32_t mergedSem = mergeSemantics(A.Semantic, R.Semantic); + LLVMContext &Ctx = R.CI->getContext(); + const bool IsControlBarrier = + R.CI->getCalledFunction()->getName() == CONTROL_BARRIER; + Type *Int32Ty = Type::getInt32Ty(Ctx); + + // Merge ordering semantics. + if (mergedSem != R.Semantic) { + R.CI->setArgOperand(IsControlBarrier ? 2 : 1, + ConstantInt::get(Int32Ty, mergedSem)); + R.Semantic = mergedSem; + } + + // Downgrade CrossWorkgroup -> Workgroup semantics. + const uint32_t CrossMask = + static_cast(MemorySemantics::CrossWorkgroupMemory); + if (R.Semantic & CrossMask) { + uint32_t NewSem = + (R.Semantic & ~CrossMask) | + static_cast(MemorySemantics::WorkgroupMemory); + R.CI->setArgOperand(IsControlBarrier ? 2 : 1, + ConstantInt::get(Int32Ty, NewSem)); + R.Semantic = NewSem; + } + + // Lower the SPIR-V MemScope operand to Workgroup. + R.CI->setArgOperand( + IsControlBarrier ? 
1 : 0, + ConstantInt::get(Int32Ty, static_cast(Scope::Workgroup))); + R.MemScope = Scope::Workgroup; + + LLVM_DEBUG(dbgs() << "Downgraded global barrier: " << *R.CI << "\n"); + Changed = true; + } + } + + return Changed; +} + +// True if BD is the first real instruction of the function. +static bool isAtKernelEntry(const BarrierDesc &BD) { + BasicBlock &Entry = BD.CI->getFunction()->getEntryBlock(); + if (BD.CI->getParent() != &Entry) + return false; + + for (Instruction &I : Entry) { + if (&I == BD.CI) + break; + if (classifyMemScope(&I) != RegionMemScope::None) + return false; + } + + return true; +} + +// True if BD is immediately before a return/unreachable and nothing follows. +static bool isAtKernelExit(const BarrierDesc &BD) { + BasicBlock *BB = BD.CI->getParent(); + Instruction *Term = BB->getTerminator(); + if (!isa(Term) && !isa(Term)) + return false; + + for (Instruction *I = BD.CI->getNextNode(); I && I != Term; + I = I->getNextNode()) + if (classifyMemScope(I) != RegionMemScope::None) + return false; + + return BD.CI->getNextNonDebugInstruction() == Term; +} + +// Remove barriers that appear at the very beginning or end of a kernel +// function. +static bool eliminateBoundaryBarriers(SmallVectorImpl &Barriers, + BBMemInfoMap &BBMemInfo) { + bool Changed = false; + for (auto *BPtr : Barriers) { + BarrierDesc &B = *BPtr; + if (!B.CI) + continue; + // Only for real SPIR kernels: + if (B.CI->getFunction()->getCallingConv() != CallingConv::SPIR_KERNEL) + continue; + RegionMemScope Fence = getBarrierFencedScope(B); + // entry: no fenced accesses on *any* path from entry to B.CI. + if (isAtKernelEntry(B) && noFencedAccessesCFG(/*pretend A = entry*/ nullptr, + B.CI, Fence, BBMemInfo)) { + Changed |= eraseBarrierWithITT(B); + continue; + } + // exit: no fenced accesses on every path from B.CI to return. + if (isAtKernelExit(B) && + noFencedAccessesCFG(B.CI, /*pretend B = exit*/ nullptr, Fence, + BBMemInfo)) { + Changed |= eraseBarrierWithITT(B); + } + } + return Changed; +} + +} // namespace + +PreservedAnalyses SYCLOptimizeBarriersPass::run(Function &F, + FunctionAnalysisManager &AM) { + if (F.getCallingConv() != CallingConv::SPIR_KERNEL) + return PreservedAnalyses::none(); + LLVM_DEBUG(dbgs() << "Running SYCLOptimizeBarriers on " << F.getName() + << "\n"); + SmallVector Barriers; + BBMemInfoMap BBMemInfo; + BarriersMap BarriersByBB; + SmallVector BarrierPtrs; + + // Analyse the function gathering barrier and memory scope of the region info. + collectBarriersAndMemInfo(F, Barriers, BBMemInfo); + for (auto &B : Barriers) + BarriersByBB[B.CI->getParent()].emplace_back(B); + + for (auto &Pair : BarriersByBB) + for (auto &BD : Pair.second) + BarrierPtrs.push_back(&BD); + + bool Changed = false; + // First remove 'at entry' and 'at exit' barriers if they fence nothing. + Changed |= eliminateBoundaryBarriers(BarrierPtrs, BBMemInfo); + // Then remove redundant barriers within a single basic block. + for (auto &BarrierBBPair : BarriersByBB) + Changed |= eliminateBackToBackInBB(BarrierBBPair.first, + BarrierBBPair.second, BBMemInfo); + + // TODO: hoist 2 barriers with the same predecessor BBs. + + // In the end eliminate or narrow barriers depending on DT and PDT analyses. + DominatorTree &DT = AM.getResult(F); + PostDominatorTree &PDT = AM.getResult(F); + + Changed |= optimizeBarriersCFG(BarrierPtrs, DT, PDT, BBMemInfo); + + return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/atomic.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/atomic.ll new file mode 100644 index 0000000000000..6f0af104eb4ae --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/atomic.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Tests that atomic instructions are classified for region memory scope and +; allow barrier optimization. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +@L = external addrspace(3) global i32 + +define spir_kernel void @spv_atomic_local() { +; CHECK-LABEL: @spv_atomic_local( +; CHECK-NEXT: entry: +; CHECK-NEXT: call spir_func void @_Z19__spirv_AtomicStorePU3AS3iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(3) @L, i32 2, i32 896, i32 0) +; CHECK-NEXT: ret void +; +entry: + call void @_Z22__spirv_ControlBarrieriii(i32 1, i32 1, i32 912) + call spir_func void @_Z19__spirv_AtomicStorePU3AS3iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(3) @L, i32 2, i32 896, i32 0) + call void @_Z22__spirv_ControlBarrieriii(i32 1, i32 1, i32 912) + ret void +} + +define spir_kernel void @llvm_atomic_local(ptr addrspace(3) %p) { +; CHECK-LABEL: @llvm_atomic_local( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add ptr addrspace(3) [[P:%.*]], i32 1 syncscope("workgroup") seq_cst, align 4 +; CHECK-NEXT: ret void +; +entry: + call void @_Z22__spirv_ControlBarrieriii(i32 1, i32 1, i32 912) + atomicrmw add ptr addrspace(3) %p, i32 1 syncscope("workgroup") seq_cst + call void @_Z22__spirv_ControlBarrieriii(i32 1, i32 1, i32 912) + ret void +} + +declare void @_Z22__spirv_ControlBarrieriii(i32, i32, i32) +declare spir_func void @_Z19__spirv_AtomicStorePU3AS3iN5__spv5Scope4FlagENS1_19MemorySemanticsMask4FlagEi(ptr addrspace(3), i32, i32, i32) diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/basic-optimizations.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/basic-optimizations.ll new file mode 100644 index 0000000000000..089adbdc4ef15 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/basic-optimizations.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; The test for various barrier optimizations performed by the +; sycl-optimize-barriers pass. 
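+; For readability, the barrier semantics constants used below decode to the
+; following SPIR-V MemorySemantics bits (matching the enums in the pass):
+;   0   - None
+;   400 - SequentiallyConsistent | SubgroupMemory | WorkgroupMemory
+;   912 - SequentiallyConsistent | SubgroupMemory | WorkgroupMemory |
+;         CrossWorkgroupMemory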
+ +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +@GV = external addrspace(1) global i32 + +@__spirv_BuiltInWorkgroupId = external addrspace(1) global <3 x i32> + +define spir_kernel void @bb_remove() { +; CHECK-LABEL: define spir_kernel void @bb_remove() { +; CHECK-NEXT: ret void +; + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + ret void +} + +define spir_kernel void @bb_remove_get_id() { +; CHECK-LABEL: define spir_kernel void @bb_remove_get_id() { +; CHECK-NEXT: [[ID1:%.*]] = load <3 x i32>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 16 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: [[ID2:%.*]] = load <3 x i32>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 16 +; CHECK-NEXT: [[ID3:%.*]] = load <3 x i32>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 16 +; CHECK-NEXT: ret void +; + %id1 = load <3 x i32>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + %id2 = load <3 x i32>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + %id3 = load <3 x i32>, ptr addrspace(1) @__spirv_BuiltInWorkgroupId + ret void +} + +define spir_kernel void @bb_private_access() { +; CHECK-LABEL: define spir_kernel void @bb_private_access() { +; CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 1, ptr [[TMP]], align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[TMP]], align 4 +; CHECK-NEXT: ret void +; + %tmp = alloca i32 + store i32 1, ptr %tmp + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + %v = load i32, ptr %tmp + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + ret void +} + +define spir_kernel void @bb_generic_alloca_access() { +; CHECK-LABEL: define spir_kernel void @bb_generic_alloca_access() { +; CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TMP_CAST:%.*]] = addrspacecast ptr [[TMP]] to ptr addrspace(4) +; CHECK-NEXT: store i32 1, ptr addrspace(4) [[TMP_CAST]], align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: [[V:%.*]] = load i32, ptr addrspace(4) [[TMP_CAST]], align 4 +; CHECK-NEXT: ret void +; + %tmp = alloca i32 + %tmp_cast = addrspacecast ptr %tmp to ptr addrspace(4) + store i32 1, ptr addrspace(4) %tmp_cast + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + %v = load i32, ptr addrspace(4) %tmp_cast + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + ret void +} + +define spir_kernel void @cfg_remove(i1 %cond) { +; CHECK-LABEL: define spir_kernel void @cfg_remove( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[BB1:.*]], label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: ret void +; +entry: + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + br i1 %cond, label 
%bb1, label %bb1 +bb1: + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + ret void +} + +define spir_kernel void @downgrade_global(ptr addrspace(3) %p) { +; CHECK-LABEL: define spir_kernel void @downgrade_global( +; CHECK-SAME: ptr addrspace(3) [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 0, ptr addrspace(3) [[P]], align 4 +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: ret void +; +entry: + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 912) + store i32 0, ptr addrspace(3) %p + br label %bb1 +bb1: + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 912) + ret void +} + +define spir_kernel void @unknown_scope(i32 %exec, i32 %mem) { +; CHECK-LABEL: define spir_kernel void @unknown_scope( +; CHECK-SAME: i32 [[EXEC:%.*]], i32 [[MEM:%.*]]) { +; CHECK-NEXT: ret void +; + call void @_Z22__spirv_ControlBarrieriii(i32 %exec, i32 %mem, i32 noundef 0) + ret void +} + +define spir_kernel void @unknown_memory() { +; CHECK-LABEL: define spir_kernel void @unknown_memory() { +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: ret void +; + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + call void @unknown() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + ret void +} + +define spir_kernel void @downgrade_semantics() { +; CHECK-LABEL: define spir_kernel void @downgrade_semantics() { +; CHECK-NEXT: ret void +; + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 912) + ret void +} + +define spir_kernel void @no_downgrade(ptr addrspace(1) %p) { +; CHECK-LABEL: define spir_kernel void @no_downgrade( +; CHECK-SAME: ptr addrspace(1) [[P:%.*]]) { +; CHECK-NEXT: store i32 0, ptr addrspace(1) [[P]], align 4 +; CHECK-NEXT: ret void +; + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 912) + store i32 0, ptr addrspace(1) %p, align 4 + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 912) + ret void +} + +define spir_kernel void @semantics_none() { +; CHECK-LABEL: define spir_kernel void @semantics_none() { +; CHECK-NEXT: ret void +; + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 0) + ret void +} + +define spir_func void @cfg_remove_sem_subsume(i1 %cond) { +; CHECK-LABEL: define spir_func void @cfg_remove_sem_subsume( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 912) +; CHECK-NEXT: br i1 [[COND]], label %[[BB1:.*]], label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 400) +; CHECK-NEXT: ret void +; +entry: + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 912) + br i1 %cond, label %bb1, label %bb1 +bb1: + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 400) + ret void +} + +declare void @unknown() + +declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) + diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/memory-barrier.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/memory-barrier.ll new file mode 100644 index 0000000000000..7dacbd03689dc --- /dev/null +++ 
b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/memory-barrier.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Simple tests for optimizing __spirv_MemoryBarrier calls. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +define spir_kernel void @mem_bb_remove() { +; CHECK-LABEL: define spir_kernel void @mem_bb_remove() { +; CHECK-NEXT: ret void +; + call void @_Z21__spirv_MemoryBarrierii(i32 noundef 2, i32 noundef 896) + call void @_Z21__spirv_MemoryBarrierii(i32 noundef 2, i32 noundef 896) + ret void +} + +define spir_kernel void @combine_with_control() { +; CHECK-LABEL: define spir_kernel void @combine_with_control() { +; CHECK-NEXT: ret void +; + call void @_Z21__spirv_MemoryBarrierii(i32 noundef 2, i32 noundef 896) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 896) + ret void +} + +declare void @_Z21__spirv_MemoryBarrierii(i32 noundef, i32 noundef) +declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-acquire-release.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-acquire-release.ll new file mode 100644 index 0000000000000..51581f2d5f134 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-acquire-release.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Test merging of acquire and release barriers into acquire-release. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +@GV = external addrspace(3) global i32 + +define spir_kernel void @acq_rel_merge() { +; CHECK-LABEL: define spir_kernel void @acq_rel_merge() { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 258) +; CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 258) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 260) + ret void +} + +declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-memory-fences.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-memory-fences.ll new file mode 100644 index 0000000000000..d2609eb7a4ef5 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-memory-fences.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Test merging of workgroup and cross-workgroup memory fences. 
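+; Semantics constants: 256 = WorkgroupMemory, 512 = CrossWorkgroupMemory
+; (no ordering bits set), matching the MemorySemantics enum in the pass.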
+ +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +@GV = external addrspace(3) global i32 + +define spir_kernel void @mem_fence_merge() { +; CHECK-LABEL: define spir_kernel void @mem_fence_merge() { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 256) +; CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 256) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 512) + ret void +} + +declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-semantics.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-semantics.ll new file mode 100644 index 0000000000000..6124d75b4a4f4 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/merge-semantics.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Test merging of adjacent barriers with different semantics. + +@GV = external addrspace(3) global i32 + +define spir_kernel void @merge_mem() { +; CHECK-LABEL: define spir_kernel void @merge_mem() { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 256) +; CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 256) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 512) + ret void +} + +define spir_kernel void @combine_acq_rel() { +; CHECK-LABEL: define spir_kernel void @combine_acq_rel() { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 8) +; CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 2) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 4) + ret void +} + +define spir_kernel void @drop_no_fence() { +; CHECK-LABEL: define spir_kernel void @drop_no_fence() { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 256) + ret void +} + + declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/multi-dominating.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/multi-dominating.ll new file mode 100644 index 0000000000000..cccbdf1009007 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/multi-dominating.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Test that 
multiple dominating global barriers combine semantics and later barriers are downgraded. + +@glob = external addrspace(1) global i32 + +define spir_kernel void @multi_series() { +; CHECK-LABEL: define spir_kernel void @multi_series() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: store i32 0, ptr addrspace(1) @glob, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 520) +; CHECK-NEXT: ret void +; +entry: + store i32 0, ptr addrspace(1) @glob, align 4 + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 514) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 516) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 1, i32 noundef 1, i32 noundef 520) + ret void +} + +declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/real-life-test.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/real-life-test.ll new file mode 100644 index 0000000000000..b6c79c85ce8ec --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/real-life-test.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; ModuleID = 'test-sycl-spir64-unknown-unknown.bc' +source_filename = "test.cpp" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spir64-unknown-unknown" + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_7nd_itemILi1EEEE_ = comdat any + +@__spirv_BuiltInWorkgroupId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInLocalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 +@__spirv_BuiltInGlobalSize = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 + +; Function Attrs: convergent mustprogress norecurse nounwind +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_7nd_itemILi1EEEE_(ptr addrspace(3) noundef align 4 %_arg_local, ptr addrspace(1) noundef align 4 %_arg_input, ptr addrspace(1) noundef align 4 %_arg_output) local_unnamed_addr #0 comdat !kernel_arg_buffer_location !6 !kernel_arg_runtime_aligned !7 !kernel_arg_exclusive_ptr !7 !sycl_fixed_targets !8 !sycl_kernel_omit_args !9 { +; CHECK-LABEL: define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_7nd_itemILi1EEEE_( +; CHECK-SAME: ptr addrspace(3) noundef align 4 [[_ARG_LOCAL:%.*]], ptr addrspace(1) noundef align 4 [[_ARG_INPUT:%.*]], ptr addrspace(1) noundef align 4 [[_ARG_OUTPUT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] comdat !kernel_arg_buffer_location [[META6:![0-9]+]] !kernel_arg_runtime_aligned [[META7:![0-9]+]] !kernel_arg_exclusive_ptr [[META7]] !sycl_fixed_targets [[META8:![0-9]+]] !sycl_kernel_omit_args [[META9:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32, !noalias [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32 +; CHECK-NEXT: [[ARRAYIDX_I16:%.*]] = getelementptr inbounds nuw float, ptr addrspace(3) [[_ARG_LOCAL]], i64 [[TMP1]] +; 
CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalSize, align 32 +; CHECK-NEXT: br label %[[FOR_COND_I:.*]] +; CHECK: [[FOR_COND_I]]: +; CHECK-NEXT: [[I_0_IN_I:%.*]] = phi i64 [ [[TMP0]], %[[ENTRY]] ], [ [[ADD_I:%.*]], %[[FOR_BODY_I:.*]] ] +; CHECK-NEXT: [[I_0_I:%.*]] = trunc i64 [[I_0_IN_I]] to i32 +; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[I_0_I]], 262144 +; CHECK-NEXT: br i1 [[CMP_I]], label %[[FOR_BODY_I]], label %[[FOR_COND_CLEANUP_I:.*]] +; CHECK: [[FOR_COND_CLEANUP_I]]: +; CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 912) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32, !noalias [[META17:![0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr addrspace(3) [[_ARG_LOCAL]], i64 [[TMP3]] +; CHECK-NEXT: br label %[[FOR_COND9_I:.*]] +; CHECK: [[FOR_BODY_I]]: +; CHECK-NEXT: [[SEXT_I:%.*]] = shl i64 [[I_0_IN_I]], 32 +; CHECK-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT_I]], 32 +; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[_ARG_INPUT]], i64 [[IDXPROM_I]] +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: store float [[TMP5]], ptr addrspace(3) [[ARRAYIDX_I16]], align 4, !tbaa [[TBAA24]] +; CHECK-NEXT: [[ADD_I]] = add i64 [[IDXPROM_I]], [[TMP2]] +; CHECK-NEXT: br label %[[FOR_COND_I]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK: [[FOR_COND9_I]]: +; CHECK-NEXT: [[OFFSET_0_I:%.*]] = phi i32 [ 1, %[[FOR_COND_CLEANUP_I]] ], [ [[MUL_I:%.*]], %[[FOR_BODY13_I:.*]] ] +; CHECK-NEXT: [[CMP10_I:%.*]] = icmp samesign ult i32 [[OFFSET_0_I]], 256 +; CHECK-NEXT: br i1 [[CMP10_I]], label %[[FOR_BODY13_I]], label %[[FOR_COND_CLEANUP11_I:.*]] +; CHECK: [[FOR_COND_CLEANUP11_I]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32, !noalias [[META30:![0-9]+]] +; CHECK-NEXT: [[CMP_I18:%.*]] = icmp eq i64 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[CMP_I18]], label %[[IF_THEN_I:.*]], label %[[_ZZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_ENKULNS0_7ND_ITEMILI1EEEE_CLES5__EXIT:.*]] +; CHECK: [[FOR_BODY13_I]]: +; CHECK-NEXT: [[CONV17_I:%.*]] = zext nneg i32 [[OFFSET_0_I]] to i64 +; CHECK-NEXT: [[ARRAYIDX_I21:%.*]] = getelementptr float, ptr addrspace(3) [[TMP4]], i64 [[CONV17_I]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr addrspace(3) [[ARRAYIDX_I21]], align 4, !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr addrspace(3) [[TMP4]], align 4, !tbaa [[TBAA24]] +; CHECK-NEXT: [[ADD24_I:%.*]] = fadd float [[TMP8]], [[TMP7]] +; CHECK-NEXT: store float [[ADD24_I]], ptr addrspace(3) [[TMP4]], align 4, !tbaa [[TBAA24]] +; CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 400) #[[ATTR2]] +; CHECK-NEXT: [[MUL_I]] = shl nuw nsw i32 [[OFFSET_0_I]], 1 +; CHECK-NEXT: br label %[[FOR_COND9_I]], !llvm.loop [[LOOP37:![0-9]+]] +; CHECK: [[IF_THEN_I]]: +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(3) [[_ARG_LOCAL]], align 4, !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 32, !noalias [[META38:![0-9]+]] +; CHECK-NEXT: [[ARRAYIDX34_I:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[_ARG_OUTPUT]], i64 [[TMP10]] +; CHECK-NEXT: store float [[TMP9]], ptr addrspace(1) [[ARRAYIDX34_I]], align 4, !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
%[[_ZZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_ENKULNS0_7ND_ITEMILI1EEEE_CLES5__EXIT]] +; CHECK: [[_ZZZ4MAINENKULRN4SYCL3_V17HANDLEREE_CLES2_ENKULNS0_7ND_ITEMILI1EEEE_CLES5__EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %0 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32, !noalias !10 + %1 = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32 + %arrayidx.i16 = getelementptr inbounds nuw float, ptr addrspace(3) %_arg_local, i64 %1 + %2 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalSize, align 32 + br label %for.cond.i + +for.cond.i: ; preds = %for.body.i, %entry + %i.0.in.i = phi i64 [ %0, %entry ], [ %add.i, %for.body.i ] + %i.0.i = trunc i64 %i.0.in.i to i32 + %cmp.i = icmp slt i32 %i.0.i, 262144 + br i1 %cmp.i, label %for.body.i, label %for.cond.cleanup.i + +for.cond.cleanup.i: ; preds = %for.cond.i + tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 912) #2 + %3 = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32, !noalias !17 + %4 = getelementptr float, ptr addrspace(3) %_arg_local, i64 %3 + br label %for.cond9.i + +for.body.i: ; preds = %for.cond.i + %sext.i = shl i64 %i.0.in.i, 32 + %idxprom.i = ashr exact i64 %sext.i, 32 + %arrayidx.i = getelementptr inbounds float, ptr addrspace(1) %_arg_input, i64 %idxprom.i + %5 = load float, ptr addrspace(1) %arrayidx.i, align 4, !tbaa !24 + store float %5, ptr addrspace(3) %arrayidx.i16, align 4, !tbaa !24 + %add.i = add i64 %idxprom.i, %2 + br label %for.cond.i, !llvm.loop !28 + +for.cond9.i: ; preds = %for.body13.i, %for.cond.cleanup.i + %offset.0.i = phi i32 [ 1, %for.cond.cleanup.i ], [ %mul.i, %for.body13.i ] + %cmp10.i = icmp samesign ult i32 %offset.0.i, 256 + br i1 %cmp10.i, label %for.body13.i, label %for.cond.cleanup11.i + +for.cond.cleanup11.i: ; preds = %for.cond9.i + %6 = load i64, ptr addrspace(1) @__spirv_BuiltInLocalInvocationId, align 32, !noalias !30 + %cmp.i18 = icmp eq i64 %6, 0 + br i1 %cmp.i18, label %if.then.i, label %_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_.exit + +for.body13.i: ; preds = %for.cond9.i + %conv17.i = zext nneg i32 %offset.0.i to i64 + %arrayidx.i21 = getelementptr float, ptr addrspace(3) %4, i64 %conv17.i + %7 = load float, ptr addrspace(3) %arrayidx.i21, align 4, !tbaa !24 + %8 = load float, ptr addrspace(3) %4, align 4, !tbaa !24 + %add24.i = fadd float %8, %7 + store float %add24.i, ptr addrspace(3) %4, align 4, !tbaa !24 + tail call spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 912) #2 + %mul.i = shl nuw nsw i32 %offset.0.i, 1 + br label %for.cond9.i, !llvm.loop !37 + +if.then.i: ; preds = %for.cond.cleanup11.i + %9 = load float, ptr addrspace(3) %_arg_local, align 4, !tbaa !24 + %10 = load i64, ptr addrspace(1) @__spirv_BuiltInWorkgroupId, align 32, !noalias !38 + %arrayidx34.i = getelementptr inbounds float, ptr addrspace(1) %_arg_output, i64 %10 + store float %9, ptr addrspace(1) %arrayidx34.i, align 4, !tbaa !24 + br label %_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_.exit + +_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_7nd_itemILi1EEEE_clES5_.exit: ; preds = %for.cond.cleanup11.i, %if.then.i + ret void +} + +; Function Attrs: convergent nounwind +declare dso_local spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) local_unnamed_addr #1 + +declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...) 
+ +attributes #0 = { convergent mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="test.cpp" "sycl-optlevel"="2" "uniform-work-group-size"="true" } +attributes #1 = { convergent nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #2 = { convergent nounwind } + +!llvm.module.flags = !{!0, !1, !2} +!opencl.spir.version = !{!3} +!spirv.Source = !{!4} +!llvm.ident = !{!5} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"sycl-device", i32 1} +!2 = !{i32 7, !"frame-pointer", i32 2} +!3 = !{i32 1, i32 2} +!4 = !{i32 4, i32 100000} +!5 = !{!"clang version 21.0.0git (https://github.com/intel/llvm.git 02aa83943d3480d7d55159309cdb0638d166c5b5)"} +!6 = !{i32 -1, i32 -1, i32 -1} +!7 = !{i1 true, i1 false, i1 false} +!8 = !{} +!9 = !{i1 false, i1 true, i1 true, i1 true, i1 false, i1 false} +!10 = !{!11, !13, !15} +!11 = distinct !{!11, !12, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!12 = distinct !{!12, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!13 = distinct !{!13, !14, !"_ZN7__spirv22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!14 = distinct !{!14, !"_ZN7__spirv22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!15 = distinct !{!15, !16, !"_ZNK4sycl3_V17nd_itemILi1EE13get_global_idEv: %agg.result"} +!16 = distinct !{!16, !"_ZNK4sycl3_V17nd_itemILi1EE13get_global_idEv"} +!17 = !{!18, !20, !22} +!18 = distinct !{!18, !19, !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!19 = distinct !{!19, !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!20 = distinct !{!20, !21, !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!21 = distinct !{!21, !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!22 = distinct !{!22, !23, !"_ZNK4sycl3_V17nd_itemILi1EE12get_local_idEv: %agg.result"} +!23 = distinct !{!23, !"_ZNK4sycl3_V17nd_itemILi1EE12get_local_idEv"} +!24 = !{!25, !25, i64 0} +!25 = !{!"float", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = distinct !{!28, !29} +!29 = !{!"llvm.loop.mustprogress"} +!30 = !{!31, !33, !35} +!31 = distinct !{!31, !32, !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!32 = distinct !{!32, !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!33 = distinct !{!33, !34, !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!34 = distinct !{!34, !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!35 = distinct !{!35, !36, !"_ZNK4sycl3_V15groupILi1EE12get_local_idEv: %agg.result"} +!36 = distinct !{!36, !"_ZNK4sycl3_V15groupILi1EE12get_local_idEv"} +!37 = distinct !{!37, !29} +!38 = !{!39, !41, !43, !45} +!39 = distinct !{!39, !40, !"_ZN7__spirv22InitSizesSTWorkgroupIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!40 = distinct !{!40, !"_ZN7__spirv22InitSizesSTWorkgroupIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!41 = distinct !{!41, !42, !"_ZN7__spirv15initWorkgroupIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!42 = distinct !{!42, !"_ZN7__spirv15initWorkgroupIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!43 = distinct !{!43, !44, !"_ZNK4sycl3_V17nd_itemILi1EE12get_group_idEv: %agg.result"} +!44 = distinct !{!44, 
!"_ZNK4sycl3_V17nd_itemILi1EE12get_group_idEv"} +!45 = distinct !{!45, !46, !"_ZNK4sycl3_V17nd_itemILi1EE9get_groupEv: %agg.result"} +!46 = distinct !{!46, !"_ZNK4sycl3_V17nd_itemILi1EE9get_groupEv"} +;. +; CHECK: [[META6]] = !{i32 -1, i32 -1, i32 -1} +; CHECK: [[META7]] = !{i1 true, i1 false, i1 false} +; CHECK: [[META8]] = !{} +; CHECK: [[META9]] = !{i1 false, i1 true, i1 true, i1 true, i1 false, i1 false} +; CHECK: [[META10]] = !{[[META11:![0-9]+]], [[META13:![0-9]+]], [[META15:![0-9]+]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]], !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +; CHECK: [[META12]] = distinct !{[[META12]], !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +; CHECK: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"_ZN7__spirv22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +; CHECK: [[META14]] = distinct !{[[META14]], !"_ZN7__spirv22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]], !"_ZNK4sycl3_V17nd_itemILi1EE13get_global_idEv: %agg.result"} +; CHECK: [[META16]] = distinct !{[[META16]], !"_ZNK4sycl3_V17nd_itemILi1EE13get_global_idEv"} +; CHECK: [[META17]] = !{[[META18:![0-9]+]], [[META20:![0-9]+]], [[META22:![0-9]+]]} +; CHECK: [[META18]] = distinct !{[[META18]], [[META19:![0-9]+]], !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +; CHECK: [[META19]] = distinct !{[[META19]], !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +; CHECK: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +; CHECK: [[META21]] = distinct !{[[META21]], !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +; CHECK: [[META22]] = distinct !{[[META22]], [[META23:![0-9]+]], !"_ZNK4sycl3_V17nd_itemILi1EE12get_local_idEv: %agg.result"} +; CHECK: [[META23]] = distinct !{[[META23]], !"_ZNK4sycl3_V17nd_itemILi1EE12get_local_idEv"} +; CHECK: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0} +; CHECK: [[META25]] = !{!"float", [[META26:![0-9]+]], i64 0} +; CHECK: [[META26]] = !{!"omnipotent char", [[META27:![0-9]+]], i64 0} +; CHECK: [[META27]] = !{!"Simple C++ TBAA"} +; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META29:![0-9]+]]} +; CHECK: [[META29]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META30]] = !{[[META31:![0-9]+]], [[META33:![0-9]+]], [[META35:![0-9]+]]} +; CHECK: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]], !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +; CHECK: [[META32]] = distinct !{[[META32]], !"_ZN7__spirv28InitSizesSTLocalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +; CHECK: [[META33]] = distinct !{[[META33]], [[META34:![0-9]+]], !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +; CHECK: [[META34]] = distinct !{[[META34]], !"_ZN7__spirv21initLocalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +; CHECK: [[META35]] = distinct !{[[META35]], [[META36:![0-9]+]], !"_ZNK4sycl3_V15groupILi1EE12get_local_idEv: %agg.result"} +; CHECK: [[META36]] = distinct !{[[META36]], !"_ZNK4sycl3_V15groupILi1EE12get_local_idEv"} +; CHECK: [[LOOP37]] = distinct !{[[LOOP37]], [[META29]]} +; CHECK: [[META38]] = !{[[META39:![0-9]+]], [[META41:![0-9]+]], [[META43:![0-9]+]], [[META45:![0-9]+]]} +; CHECK: [[META39]] = 
distinct !{[[META39]], [[META40:![0-9]+]], !"_ZN7__spirv22InitSizesSTWorkgroupIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +; CHECK: [[META40]] = distinct !{[[META40]], !"_ZN7__spirv22InitSizesSTWorkgroupIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +; CHECK: [[META41]] = distinct !{[[META41]], [[META42:![0-9]+]], !"_ZN7__spirv15initWorkgroupIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +; CHECK: [[META42]] = distinct !{[[META42]], !"_ZN7__spirv15initWorkgroupIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +; CHECK: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"_ZNK4sycl3_V17nd_itemILi1EE12get_group_idEv: %agg.result"} +; CHECK: [[META44]] = distinct !{[[META44]], !"_ZNK4sycl3_V17nd_itemILi1EE12get_group_idEv"} +; CHECK: [[META45]] = distinct !{[[META45]], [[META46:![0-9]+]], !"_ZNK4sycl3_V17nd_itemILi1EE9get_groupEv: %agg.result"} +; CHECK: [[META46]] = distinct !{[[META46]], !"_ZNK4sycl3_V17nd_itemILi1EE9get_groupEv"} +;. diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-back-to-back-barrier.ll similarity index 55% rename from llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll rename to llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-back-to-back-barrier.ll index 00edaefb9cc6c..8a32cc210138a 100644 --- a/llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-back-to-back-barrier.ll @@ -1,37 +1,37 @@ -; RUN: opt -passes=sycl-optimize-back-to-back-barrier -S < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s ; The test checks if back-to-back __spirv_ControlBarrier and ITT annotations are ; removed. 
-; CHECK-LABEL: define spir_func void @_Z3fooii(i32 %[[#Scope1:]], i32 %[[#Scope2:]]) -; CHECK: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 1, i32 noundef 912) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() -; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 2, i32 noundef 912) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() -; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 64, i32 noundef 2, i32 noundef 912) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() -; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 %[[#Scope1]], i32 noundef 2, i32 noundef 912) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() -; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 %[[#Scope2]], i32 noundef 2, i32 noundef 912) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() -; CHECK-NEXT: ret void - -; CHECK-LABEL: define dso_local void @_Z3booi -; CHECK: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() -; CHECK: call spir_func void @__itt_offload_wg_barrier_wrapper() -; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) -; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() + target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spirv64-unknown-unknown" -define spir_func void @_Z3fooii(i32 %0, i32 %1) { +@GV = external addrspace(3) global i32 + +define spir_kernel void @_Z3fooii(i32 %0, i32 %1) { +; CHECK-LABEL: define spir_kernel void @_Z3fooii( +; CHECK-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 400) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 64, i32 noundef 2, i32 noundef 400) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP0]], i32 noundef 2, i32 noundef 400) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP0]], i32 noundef 2, i32 noundef 400) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 [[TMP1]], i32 noundef 2, i32 noundef 400) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; 
CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV call spir_func void @__itt_offload_wg_barrier_wrapper() call void @_Z22__spirv_ControlBarrieriii(i32 noundef 4, i32 noundef 1, i32 noundef 912) call spir_func void @__itt_offload_wi_resume_wrapper() @@ -76,6 +76,21 @@ define spir_func void @_Z3fooii(i32 %0, i32 %1) { } define dso_local void @_Z3booi(i32 noundef %0) local_unnamed_addr #0 { +; CHECK-LABEL: define dso_local void @_Z3booi( +; CHECK-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[BB3:.*]], label %[[BB4:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: br label %[[BB4]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: ret void +; %2 = icmp eq i32 %0, 0 br i1 %2, label %3, label %4 diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-subgroup-barrier.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-subgroup-barrier.ll new file mode 100644 index 0000000000000..df38d0a70c8e5 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBarriers/remove-subgroup-barrier.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=sycl-optimize-barriers -S < %s | FileCheck %s + +; Test removal of a subgroup barrier when followed by a workgroup barrier. + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spirv64-unknown-unknown" + +@GV = external addrspace(3) global i32 + +define spir_kernel void @remove_subgroup() { +; CHECK-LABEL: define spir_kernel void @remove_subgroup() { +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(3) @GV, align 4 +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) +; CHECK-NEXT: ret void +; + %val = load i32, ptr addrspace(3) @GV + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 2, i32 noundef 0) + ret void +} + +declare void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef)