Skip to content

Commit 6cbe7d6

Browse files
committed
[AMDGPU] Insert s_wait_xcnt(0) before atomics to work around write-combining miss hazard
This patch adds a workaround for a hazard on GFX1250: it inserts an `s_wait_xcnt(0)` instruction before any atomic operation that might write to memory. Fixes SWDEV-543703.
1 parent fcba304 commit 6cbe7d6

File tree

46 files changed

+1680
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+1680
-1
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,12 @@ def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
895895
[FeatureFP8ConversionInsts]
896896
>;
897897

898+
// Subtarget feature marking targets affected by the write-combining hazard.
// The workaround for it is an s_wait_xcnt(0) before atomic operations that may
// write to memory.
def FeatureWriteCombiningHazzards : SubtargetFeature<"write-combining-hazzards",
  "HasWriteCombiningHazzards",
  "true",
  "Write combining hazards that require s_wait_xcnt(0) before every atomic operation"
>;
903+
898904
def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
899905
"HasPkFmacF16Inst",
900906
"true",
@@ -2145,6 +2151,7 @@ def FeatureISAVersion12_50 : FeatureSet<
21452151
FeatureXNACK,
21462152
FeatureClusters,
21472153
FeatureD16Writes32BitVgpr,
2154+
FeatureWriteCombiningHazzards,
21482155
]>;
21492156

21502157
def FeatureISAVersion12_51 : FeatureSet<
@@ -2945,6 +2952,8 @@ def HasGWS : Predicate<"Subtarget->hasGWS()">;
29452952
def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">;
29462953
def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
29472954

2955+
// Predicate that holds when Subtarget->hasWriteCombiningHazzards() is true.
def HasWriteCombiningHazzards : Predicate<"Subtarget->hasWriteCombiningHazzards()">;
2956+
29482957
def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
29492958

29502959
def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,6 +1269,8 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
12691269
fixScratchBaseForwardingHazard(MI);
12701270
if (ST.setRegModeNeedsVNOPs())
12711271
fixSetRegMode(MI);
1272+
// Apply the write-combining hazard workaround on subtargets that report the
// feature. The subtarget accessor is hasWriteCombiningHazzards().
if (ST.hasWriteCombiningHazzards())
  fixWriteCombiningHazzards(MI);
12721274
}
12731275

12741276
static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI,
@@ -2177,6 +2179,29 @@ bool GCNHazardRecognizer::fixWMMACoexecutionHazards(MachineInstr *MI) {
21772179
return true;
21782180
}
21792181

2182+
/// This function inserts an s_wait_cnt(0) before every atomic store/RMW
2183+
/// operation to work around the write combining miss HW bug.
2184+
bool GCNHazardRecognizer::fixWriteCombiningHazzards(MachineInstr *MI) {
2185+
if (!SIInstrInfo::isAtomic(*MI) || !MI->mayStore())
2186+
return false;
2187+
2188+
// If the previous instruction is an s_wait_xcnt, and the count is 0, we don't
2189+
// need to do anything.
2190+
MachineBasicBlock &MBB = *MI->getParent();
2191+
auto Itr = MachineBasicBlock::iterator(MI);
2192+
auto PrevItr = std::prev(Itr);
2193+
if (Itr != MBB.begin() && (PrevItr->getOpcode() == AMDGPU::S_WAIT_XCNT_soft ||
2194+
PrevItr->getOpcode() == AMDGPU::S_WAIT_XCNT)) {
2195+
int64_t Cnt = PrevItr->getOperand(0).getImm();
2196+
if (Cnt == 0)
2197+
return false;
2198+
}
2199+
2200+
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAIT_XCNT_soft))
2201+
.addImm(0);
2202+
return true;
2203+
}
2204+
21802205
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
21812206
if (!ST.hasShift64HighRegBug())
21822207
return false;

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
114114
bool fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI);
115115
bool fixScratchBaseForwardingHazard(MachineInstr *MI);
116116
bool fixSetRegMode(MachineInstr *MI);
117+
bool fixWriteCombiningHazzards(MachineInstr *MI);
117118

118119
int checkMAIHazards(MachineInstr *MI);
119120
int checkMAIHazards908(MachineInstr *MI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -282,7 +282,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
282282
bool HasPointSampleAccel = false;
283283
bool HasLdsBarrierArriveAtomic = false;
284284
bool HasSetPrioIncWgInst = false;
285-
285+
bool HasWriteCombiningHazzards = false;
286286
bool RequiresCOV6 = false;
287287
bool UseBlockVGPROpsForCSR = false;
288288
bool HasGloballyAddressableScratch = false;
@@ -1836,6 +1836,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
18361836
return getGeneration() == GFX12;
18371837
}
18381838

1839+
/// \returns the value of the HasWriteCombiningHazzards subtarget flag.
bool hasWriteCombiningHazzards() const {
  return HasWriteCombiningHazzards;
}
1840+
18391841
// Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
18401842
// read.
18411843
bool hasScratchBaseForwardingHazard() const {

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 55 additions & 0 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/atomic_store_local.ll

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,9 @@ define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
4949
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
5050
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
5151
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
52+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
5253
; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v1
54+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
5355
; GFX1250-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1
5456
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
5557
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -59,7 +61,9 @@ define void @atomic_store_monotonic_i8(ptr addrspace(3) %ptr, i8 %val) {
5961
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
6062
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
6163
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
64+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
6265
; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v1
66+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
6367
; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v2
6468
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
6569
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -112,7 +116,9 @@ define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) {
112116
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
113117
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
114118
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
119+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
115120
; GFX1250-TRUE16-NEXT: ds_store_b8 v0, v1 offset:8
121+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
116122
; GFX1250-TRUE16-NEXT: ds_store_b8_d16_hi v0, v1 offset:16
117123
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
118124
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -122,7 +128,9 @@ define void @atomic_store_monotonic_offset_i8(ptr addrspace(3) %ptr, i8 %val) {
122128
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
123129
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
124130
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
131+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
125132
; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v1 offset:8
133+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
126134
; GFX1250-FAKE16-NEXT: ds_store_b8 v0, v2 offset:16
127135
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
128136
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -177,7 +185,9 @@ define void @atomic_store_monotonic_i16(ptr addrspace(3) %ptr, i16 %val) {
177185
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
178186
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
179187
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
188+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
180189
; GFX1250-TRUE16-NEXT: ds_store_b16 v0, v1
190+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
181191
; GFX1250-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1
182192
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
183193
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -187,7 +197,9 @@ define void @atomic_store_monotonic_i16(ptr addrspace(3) %ptr, i16 %val) {
187197
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
188198
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
189199
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
200+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
190201
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v1
202+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
191203
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v2
192204
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
193205
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -240,7 +252,9 @@ define void @atomic_store_monotonic_offset_i16(ptr addrspace(3) %ptr, i16 %val)
240252
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
241253
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
242254
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
255+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
243256
; GFX1250-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32
257+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
244258
; GFX1250-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32
245259
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
246260
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -250,7 +264,9 @@ define void @atomic_store_monotonic_offset_i16(ptr addrspace(3) %ptr, i16 %val)
250264
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
251265
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
252266
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
267+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
253268
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32
269+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
254270
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32
255271
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
256272
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -288,6 +304,7 @@ define void @atomic_store_monotonic_i32(ptr addrspace(3) %ptr, i32 %val) {
288304
; GFX1250: ; %bb.0:
289305
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
290306
; GFX1250-NEXT: s_wait_kmcnt 0x0
307+
; GFX1250-NEXT: s_wait_xcnt 0x0
291308
; GFX1250-NEXT: ds_store_b32 v0, v1
292309
; GFX1250-NEXT: s_wait_dscnt 0x0
293310
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -322,6 +339,7 @@ define void @atomic_store_monotonic_offset_i32(ptr addrspace(3) %ptr, i32 %val)
322339
; GFX1250: ; %bb.0:
323340
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
324341
; GFX1250-NEXT: s_wait_kmcnt 0x0
342+
; GFX1250-NEXT: s_wait_xcnt 0x0
325343
; GFX1250-NEXT: ds_store_b32 v0, v1 offset:64
326344
; GFX1250-NEXT: s_wait_dscnt 0x0
327345
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -358,6 +376,7 @@ define void @atomic_store_monotonic_i64(ptr addrspace(3) %ptr, i64 %val) {
358376
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
359377
; GFX1250-NEXT: s_wait_kmcnt 0x0
360378
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
379+
; GFX1250-NEXT: s_wait_xcnt 0x0
361380
; GFX1250-NEXT: ds_store_b64 v0, v[2:3]
362381
; GFX1250-NEXT: s_wait_dscnt 0x0
363382
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -393,6 +412,7 @@ define void @atomic_store_monotonic_offset_i64(ptr addrspace(3) %ptr, i64 %val)
393412
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
394413
; GFX1250-NEXT: s_wait_kmcnt 0x0
395414
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
415+
; GFX1250-NEXT: s_wait_xcnt 0x0
396416
; GFX1250-NEXT: ds_store_b64 v0, v[2:3] offset:128
397417
; GFX1250-NEXT: s_wait_dscnt 0x0
398418
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -444,7 +464,9 @@ define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
444464
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
445465
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
446466
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
467+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
447468
; GFX1250-TRUE16-NEXT: ds_store_b16 v0, v1
469+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
448470
; GFX1250-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1
449471
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
450472
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -454,7 +476,9 @@ define void @atomic_store_monotonic_f16(ptr addrspace(3) %ptr, i16 %arg.val) {
454476
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
455477
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
456478
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
479+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
457480
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v1
481+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
458482
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v2
459483
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
460484
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -509,7 +533,9 @@ define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.v
509533
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
510534
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
511535
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
536+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
512537
; GFX1250-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32
538+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
513539
; GFX1250-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32
514540
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
515541
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -519,7 +545,9 @@ define void @atomic_store_monotonic_offset_f16(ptr addrspace(3) %ptr, i16 %arg.v
519545
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
520546
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
521547
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
548+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
522549
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32
550+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
523551
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32
524552
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
525553
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -575,7 +603,9 @@ define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
575603
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
576604
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
577605
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
606+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
578607
; GFX1250-TRUE16-NEXT: ds_store_b16 v0, v1
608+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
579609
; GFX1250-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1
580610
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
581611
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -585,7 +615,9 @@ define void @atomic_store_monotonic_bf16(ptr addrspace(3) %ptr, i16 %arg.val) {
585615
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
586616
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
587617
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
618+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
588619
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v1
620+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
589621
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v2
590622
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
591623
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
@@ -640,7 +672,9 @@ define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.
640672
; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
641673
; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
642674
; GFX1250-TRUE16-NEXT: v_add_nc_u16 v1.h, v1.l, 2
675+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
643676
; GFX1250-TRUE16-NEXT: ds_store_b16 v0, v1 offset:32
677+
; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
644678
; GFX1250-TRUE16-NEXT: ds_store_b16_d16_hi v0, v1 offset:32
645679
; GFX1250-TRUE16-NEXT: s_wait_dscnt 0x0
646680
; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
@@ -650,7 +684,9 @@ define void @atomic_store_monotonic_offset_bf16(ptr addrspace(3) %ptr, i16 %arg.
650684
; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
651685
; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
652686
; GFX1250-FAKE16-NEXT: v_add_nc_u16 v2, v1, 2
687+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
653688
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v1 offset:32
689+
; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
654690
; GFX1250-FAKE16-NEXT: ds_store_b16 v0, v2 offset:32
655691
; GFX1250-FAKE16-NEXT: s_wait_dscnt 0x0
656692
; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]

0 commit comments

Comments
 (0)