Skip to content

Commit 08d747c

Browse files
[AMDGPU] Fix bad removal of s_delay_alu (#145728)
The instructionWaitsForSGPRWrites function covers all SALU instructions, including those such as s_waitcnt that do not read from an SGPR. This causes s_delay_alu instructions to be removed in cases like VALU->SGPR->VALU, which results in a performance regression. This change modifies the function so that it also checks whether the instruction reads an SGPR.
1 parent 39f19f2 commit 08d747c

File tree

55 files changed

+808
-269
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+808
-269
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,17 @@ class AMDGPUInsertDelayAlu {
4949

5050
static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) {
5151
// These instruction types wait for VA_SDST==0 before issuing.
52-
const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD;
52+
uint64_t MIFlags = MI.getDesc().TSFlags;
53+
if (MIFlags & SIInstrFlags::SMRD)
54+
return true;
5355

54-
return MI.getDesc().TSFlags & VA_SDST_0;
56+
if (MIFlags & SIInstrFlags::SALU) {
57+
for (auto &Op : MI.operands()) {
58+
if (Op.isReg())
59+
return true;
60+
}
61+
}
62+
return false;
5563
}
5664

5765
// Types of delay that can be encoded in an s_delay_alu instruction.

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3070,6 +3070,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
30703070
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
30713071
; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
30723072
; GFX12-NEXT: s_wait_alu 0xfffd
3073+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
30733074
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
30743075
; GFX12-NEXT: flat_store_b32 v[0:1], v3
30753076
; GFX12-NEXT: s_endpgm
@@ -4161,6 +4162,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
41614162
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
41624163
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
41634164
; GFX12-NEXT: s_wait_alu 0xfffd
4165+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
41644166
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
41654167
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
41664168
; GFX12-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,11 +1025,12 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
10251025
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10261026
; GFX12-NEXT: v_mov_b32_e32 v2, v11
10271027
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2]
1028-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1028+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
10291029
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2]
10301030
; GFX12-NEXT: s_wait_alu 0xf1ff
10311031
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0
10321032
; GFX12-NEXT: s_wait_alu 0xfffd
1033+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10331034
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo
10341035
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7]
10351036
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -2387,33 +2388,39 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
23872388
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0
23882389
; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14
23892390
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1]
2390-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
2391+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
23912392
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19]
23922393
; GFX12-NEXT: s_wait_alu 0xf1ff
23932394
; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0
23942395
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1]
2396+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
23952397
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19]
23962398
; GFX12-NEXT: s_wait_alu 0xfffd
23972399
; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo
23982400
; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0
2401+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
23992402
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1]
24002403
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19]
24012404
; GFX12-NEXT: s_wait_alu 0xfffd
24022405
; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo
2406+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24032407
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1]
24042408
; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19]
24052409
; GFX12-NEXT: s_wait_alu 0xfffd
2410+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24062411
; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo
24072412
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1]
2408-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2413+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
24092414
; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1]
24102415
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21]
24112416
; GFX12-NEXT: s_wait_alu 0xf1ff
24122417
; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0
24132418
; GFX12-NEXT: v_mov_b32_e32 v20, v22
2419+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24142420
; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1]
24152421
; GFX12-NEXT: s_wait_alu 0xfffd
24162422
; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo
2423+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
24172424
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20]
24182425
; GFX12-NEXT: v_mov_b32_e32 v19, v22
24192426
; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15
@@ -2434,6 +2441,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24342441
; GFX12-NEXT: s_wait_alu 0xf1ff
24352442
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
24362443
; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21
2444+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
24372445
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12]
24382446
; GFX12-NEXT: s_wait_alu 0xf1ff
24392447
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2
@@ -2447,6 +2455,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24472455
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11]
24482456
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13]
24492457
; GFX12-NEXT: s_wait_alu 0xf1ff
2458+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
24502459
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5
24512460
; GFX12-NEXT: s_wait_alu 0xf1ff
24522461
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5
@@ -2463,9 +2472,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
24632472
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3
24642473
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1
24652474
; GFX12-NEXT: s_wait_alu 0xfffd
2466-
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
24672475
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2476+
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo
24682477
; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0
2478+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
24692479
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10]
24702480
; GFX12-NEXT: s_setpc_b64 s[30:31]
24712481
%result = mul i256 %num, %den

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
240240
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
241241
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
242242
; GFX12W64-NEXT: s_wait_alu 0xf1ff
243+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
243244
; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
244245
; GFX12W64-NEXT: s_wait_kmcnt 0x0
245246
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -453,6 +454,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
453454
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
454455
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
455456
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
457+
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
456458
; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
457459
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
458460
; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -482,6 +484,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
482484
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
483485
; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
484486
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
487+
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
485488
; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5]
486489
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
487490
; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
@@ -514,6 +517,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
514517
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
515518
; GFX12W64-NEXT: s_wait_kmcnt 0x0
516519
; GFX12W64-NEXT: s_wait_alu 0xf1ff
520+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
517521
; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
518522
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
519523
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -544,6 +548,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
544548
; GFX12W32-NEXT: s_wait_loadcnt 0x0
545549
; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
546550
; GFX12W32-NEXT: s_wait_kmcnt 0x0
551+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
547552
; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5]
548553
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
549554
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
@@ -882,6 +887,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
882887
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
883888
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
884889
; GFX12W64-NEXT: s_wait_alu 0xf1ff
890+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
885891
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
886892
; GFX12W64-NEXT: s_wait_kmcnt 0x0
887893
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -926,6 +932,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
926932
; GFX12W32-NEXT: s_wait_loadcnt 0x0
927933
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
928934
; GFX12W32-NEXT: s_wait_alu 0xf1ff
935+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
929936
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
930937
; GFX12W32-NEXT: s_wait_kmcnt 0x0
931938
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1285,6 +1292,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
12851292
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
12861293
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
12871294
; GFX12W64-NEXT: s_wait_alu 0xf1ff
1295+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
12881296
; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0
12891297
; GFX12W64-NEXT: s_wait_kmcnt 0x0
12901298
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1331,6 +1339,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
13311339
; GFX12W32-NEXT: s_wait_loadcnt 0x0
13321340
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
13331341
; GFX12W32-NEXT: s_wait_alu 0xf1ff
1342+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
13341343
; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0
13351344
; GFX12W32-NEXT: s_wait_kmcnt 0x0
13361345
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -1968,6 +1977,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
19681977
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
19691978
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
19701979
; GFX12W64-NEXT: s_wait_alu 0xf1ff
1980+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
19711981
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
19721982
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
19731983
; GFX12W64-NEXT: s_endpgm
@@ -2000,6 +2010,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
20002010
; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
20012011
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
20022012
; GFX12W32-NEXT: s_wait_alu 0xf1ff
2013+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
20032014
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
20042015
; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
20052016
; GFX12W32-NEXT: s_endpgm
@@ -2338,6 +2349,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
23382349
; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
23392350
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
23402351
; GFX12W64-NEXT: s_wait_alu 0xf1ff
2352+
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
23412353
; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
23422354
; GFX12W64-NEXT: s_wait_kmcnt 0x0
23432355
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
@@ -2383,6 +2395,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
23832395
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
23842396
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
23852397
; GFX12W32-NEXT: s_wait_alu 0xf1ff
2398+
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
23862399
; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
23872400
; GFX12W32-NEXT: s_wait_kmcnt 0x0
23882401
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3390,6 +3390,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
33903390
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31
33913391
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
33923392
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
3393+
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
33933394
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
33943395
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
33953396
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -3445,6 +3446,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
34453446
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5
34463447
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7
34473448
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
3449+
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
34483450
; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8
34493451
; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
34503452
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -6954,6 +6956,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
69546956
; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31
69556957
; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
69566958
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
6959+
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
69576960
; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
69586961
; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0
69596962
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -7009,6 +7012,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
70097012
; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5
70107013
; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7
70117014
; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff
7015+
; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
70127016
; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8
70137017
; GFX1264_DPP-NEXT: s_mov_b32 s2, s6
70147018
; GFX1264_DPP-NEXT: s_wait_alu 0xfffd
@@ -8233,6 +8237,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
82338237
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
82348238
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
82358239
; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
8240+
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
82368241
; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
82378242
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
82388243
; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -8298,6 +8303,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
82988303
; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
82998304
; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
83008305
; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
8306+
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
83018307
; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
83028308
; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
83038309
; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -8364,6 +8370,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
83648370
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
83658371
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
83668372
; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
8373+
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
83678374
; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
83688375
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
83698376
; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -8429,6 +8436,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
84298436
; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
84308437
; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
84318438
; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
8439+
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
84328440
; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
84338441
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
84348442
; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null
@@ -8818,7 +8826,7 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
88188826
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
88198827
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
88208828
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8821-
; GFX7LESS-NEXT: ; implicit-def: $vgpr0
8829+
; GFX7LESS-NEXT: ; implicit-def: $vgpr0
88228830
; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
88238831
; GFX7LESS-NEXT: s_cbranch_execz .LBB15_2
88248832
; GFX7LESS-NEXT: ; %bb.1:
@@ -9328,7 +9336,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
93289336
; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
93299337
; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0
93309338
; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
9331-
; GFX7LESS-NEXT: ; implicit-def: $vgpr0
9339+
; GFX7LESS-NEXT: ; implicit-def: $vgpr0
93329340
; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc
93339341
; GFX7LESS-NEXT: s_cbranch_execz .LBB16_4
93349342
; GFX7LESS-NEXT: ; %bb.1:
@@ -9931,6 +9939,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
99319939
; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
99329940
; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
99339941
; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff
9942+
; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
99349943
; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2
99359944
; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1
99369945
; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -9996,6 +10005,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
999610005
; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
999710006
; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
999810007
; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
10008+
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
999910009
; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2
1000010010
; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1
1000110011
; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10062,6 +10072,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
1006210072
; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
1006310073
; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0
1006410074
; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff
10075+
; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1006510076
; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2
1006610077
; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1
1006710078
; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -10127,6 +10138,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
1012710138
; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
1012810139
; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0
1012910140
; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
10141+
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1013010142
; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2
1013110143
; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1
1013210144
; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null
@@ -12703,6 +12715,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
1270312715
; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
1270412716
; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0
1270512717
; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd
12718+
; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1270612719
; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc
1270712720
; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff
1270812721
; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1]
@@ -12816,6 +12829,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
1281612829
; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
1281712830
; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0
1281812831
; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd
12832+
; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1281912833
; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
1282012834
; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff
1282112835
; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0

0 commit comments

Comments
 (0)