Skip to content

Commit 0033198

Browse files
committed
[AMDGPU] Insert s_wait_xcnt(0) before atomics to work around write-combining miss hazard
This patch adds a workaround for a hazard on GFX1250: it inserts an `s_wait_xcnt(0)` instruction before any atomic operation that might write to memory. Fixes SWDEV-543703.
1 parent 622d52d commit 0033198

File tree

13 files changed

+356
-2
lines changed

13 files changed

+356
-2
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,12 @@ def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
895895
[FeatureFP8ConversionInsts]
896896
>;
897897

898+
def FeatureWriteCombiningMissesHazards : SubtargetFeature<"write-combining-misses-hazards",
899+
"HasWriteCombiningMissesHazards",
900+
"true",
901+
"Write combining misses hazards that require s_wait_cnt(0) before every atomic operation"
902+
>;
903+
898904
def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
899905
"HasPkFmacF16Inst",
900906
"true",
@@ -2145,6 +2151,7 @@ def FeatureISAVersion12_50 : FeatureSet<
21452151
FeatureXNACK,
21462152
FeatureClusters,
21472153
FeatureD16Writes32BitVgpr,
2154+
FeatureWriteCombiningMissesHazards,
21482155
]>;
21492156

21502157
def FeatureISAVersion12_51 : FeatureSet<
@@ -2945,6 +2952,8 @@ def HasGWS : Predicate<"Subtarget->hasGWS()">;
29452952
def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">;
29462953
def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
29472954

2955+
def HasWriteCombiningMissesHazards : Predicate<"Subtarget->hasWriteCombiningMissesHazards()">;
2956+
29482957
def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
29492958

29502959
def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
282282
bool HasPointSampleAccel = false;
283283
bool HasLdsBarrierArriveAtomic = false;
284284
bool HasSetPrioIncWgInst = false;
285-
285+
bool HasWriteCombiningMissesHazards = false;
286286
bool RequiresCOV6 = false;
287287
bool UseBlockVGPROpsForCSR = false;
288288
bool HasGloballyAddressableScratch = false;
@@ -1834,6 +1834,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18341834
return getGeneration() == GFX12;
18351835
}
18361836

1837+
bool hasWriteCombiningMissesHazards() const {
1838+
return HasWriteCombiningMissesHazards;
1839+
}
1840+
18371841
// Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
18381842
// read.
18391843
bool hasScratchBaseForwardingHazard() const {

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2086,6 +2086,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20862086
// Verify that the wait is actually needed.
20872087
ScoreBrackets.simplifyWaitcnt(Wait);
20882088

2089+
// An s_wait_xcnt(0) before every atomic store/RMW operation is required to
2090+
// work around the write combining misses hazard.
2091+
if (ST->hasWriteCombiningMissesHazards() && SIInstrInfo::isAtomic(MI) &&
2092+
SIInstrInfo::isVMEM(MI) && MI.mayStore())
2093+
Wait.XCnt = 0;
2094+
20892095
// When forcing emit, we need to skip terminators because that would break the
20902096
// terminators of the MBB if we emit a waitcnt between terminators.
20912097
if (ForceEmitZeroFlag && !MI.isTerminator())

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1764,6 +1764,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
17641764
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
17651765
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
17661766
; GFX1250-NEXT: s_wait_storecnt 0x0
1767+
; GFX1250-NEXT: s_wait_xcnt 0x0
17671768
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
17681769
; GFX1250-NEXT: s_wait_loadcnt 0x0
17691770
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -1802,6 +1803,7 @@ define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, doubl
18021803
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
18031804
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
18041805
; GFX1250-NEXT: s_wait_storecnt 0x0
1806+
; GFX1250-NEXT: s_wait_xcnt 0x0
18051807
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
18061808
; GFX1250-NEXT: s_wait_loadcnt 0x0
18071809
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -1842,6 +1844,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
18421844
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
18431845
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
18441846
; GFX1250-NEXT: s_wait_storecnt 0x0
1847+
; GFX1250-NEXT: s_wait_xcnt 0x0
18451848
; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18461849
; GFX1250-NEXT: s_wait_loadcnt 0x0
18471850
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -2088,6 +2091,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
20882091
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
20892092
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
20902093
; GFX1250-NEXT: s_wait_storecnt 0x0
2094+
; GFX1250-NEXT: s_wait_xcnt 0x0
20912095
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
20922096
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
20932097
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
@@ -2126,6 +2130,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
21262130
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
21272131
; GFX1250-NEXT: global_wb scope:SCOPE_DEV
21282132
; GFX1250-NEXT: s_wait_storecnt 0x0
2133+
; GFX1250-NEXT: s_wait_xcnt 0x0
21292134
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
21302135
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
21312136
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
@@ -2168,6 +2173,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
21682173
; GFX1250-NEXT: v_mov_b64_e32 v[2:3], 4.0
21692174
; GFX1250-NEXT: global_wb scope:SCOPE_SYS
21702175
; GFX1250-NEXT: s_wait_storecnt 0x0
2176+
; GFX1250-NEXT: s_wait_xcnt 0x0
21712177
; GFX1250-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
21722178
; GFX1250-NEXT: s_wait_loadcnt 0x0
21732179
; GFX1250-NEXT: global_inv scope:SCOPE_SYS

0 commit comments

Comments
 (0)