diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 10efbab805676..383b5c0e9dcd5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -49,9 +49,17 @@ class AMDGPUInsertDelayAlu { static bool instructionWaitsForSGPRWrites(const MachineInstr &MI) { // These instruction types wait for VA_SDST==0 before issuing. - const uint64_t VA_SDST_0 = SIInstrFlags::SALU | SIInstrFlags::SMRD; + uint64_t MIFlags = MI.getDesc().TSFlags; + if (MIFlags & SIInstrFlags::SMRD) + return true; - return MI.getDesc().TSFlags & VA_SDST_0; + if (MIFlags & SIInstrFlags::SALU) { + for (auto &Op : MI.operands()) { + if (Op.isReg()) + return true; + } + } + return false; } // Types of delay that can be encoded in an s_delay_alu instruction. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index cbadd1eb431fc..788a4e6fb2141 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -3070,6 +3070,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: flat_store_b32 v[0:1], v3 ; GFX12-NEXT: s_endpgm @@ -4161,6 +4162,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr % ; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index dd37e855c4a3f..a224c8b391323 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1025,11 +1025,12 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mov_b32_e32 v2, v11 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[6:7] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -2387,33 +2388,39 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 ; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 ; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 ; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 @@ -2434,6 +2441,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 ; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 @@ -2447,6 +2455,7 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 @@ -2463,9 +2472,10 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = mul i256 %num, %den diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index 8319e112f526e..4b68f8a4bd194 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -240,6 +240,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -453,6 +454,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] @@ -482,6 +484,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] @@ -514,6 +517,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -544,6 +548,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -882,6 +887,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -926,6 +932,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1285,6 +1292,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1331,6 +1339,7 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1968,6 +1977,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -2000,6 +2010,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_endpgm @@ -2338,6 +2349,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2383,6 +2395,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index f979d01e495ba..70211c302829c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -3390,6 +3390,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -3445,6 +3446,7 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_add_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -6954,6 +6956,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_readlane_b32 s2, v2, 31 ; GFX1264_DPP-NEXT: v_mov_b32_dpp v3, v4 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264_DPP-NEXT: v_add_co_u32_e64_dpp v2, vcc, v2, s2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf ; GFX1264_DPP-NEXT: v_mov_b32_e32 v4, 0 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -7009,6 +7012,7 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1264_DPP-NEXT: v_mov_b32_e32 v9, v5 ; GFX1264_DPP-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1264_DPP-NEXT: s_wait_alu 0xf1ff +; GFX1264_DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1264_DPP-NEXT: v_sub_co_u32 v6, vcc, s2, v8 ; GFX1264_DPP-NEXT: s_mov_b32 s2, s6 ; GFX1264_DPP-NEXT: s_wait_alu 0xfffd @@ -8233,6 +8237,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8298,6 +8303,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8364,6 +8370,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8429,6 +8436,7 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac ; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null @@ -8818,7 +8826,7 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB15_2 ; GFX7LESS-NEXT: ; %bb.1: @@ -9328,7 +9336,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr0 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_4 ; GFX7LESS-NEXT: ; %bb.1: @@ -9931,6 +9939,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9996,6 +10005,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_mad_u16 v0, s10, v4, s2 ; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10062,6 +10072,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10127,6 +10138,7 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_mad_u16 v0, s8, v4, s2 ; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -12703,6 +12715,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1264-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1264-FAKE16-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 ; GFX1264-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc ; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s[0:1] @@ -12816,6 +12829,7 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add ; GFX1232-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff ; GFX1232-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v0, v0 ; GFX1232-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, v3, v5, s0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 8e0b3cb9aa1d5..17737cccec7c4 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -405,6 +405,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive) ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 @@ -1901,6 +1902,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1164-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] @@ -1937,6 +1939,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive) ; GFX1132-NEXT: v_readfirstlane_b32 s5, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s4, v0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5] ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 6a82dbeec5e2f..0a06fe4ea949e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -239,6 +239,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -452,6 +453,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] @@ -481,6 +483,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] @@ -513,6 +516,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -543,6 +547,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -881,6 +886,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -925,6 +931,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1562,6 +1569,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -1594,6 +1602,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_endpgm @@ -1932,6 +1941,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1977,6 +1987,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index dd4c0b0625ea8..bc0bec4772e52 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -247,6 +247,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -466,6 +467,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W64-NEXT: s_waitcnt vmcnt(0) ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3] ; GFX11W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1] @@ -496,6 +498,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX11W32-NEXT: s_waitcnt vmcnt(0) ; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s0, v0, s[4:5] ; GFX11W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3] @@ -529,6 +532,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -559,6 +563,7 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[4:5] ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] @@ -904,6 +909,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -948,6 +954,7 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: s_wait_loadcnt 0x0 ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1729,6 +1736,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -1761,6 +1769,7 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12W32-NEXT: s_endpgm @@ -2106,6 +2115,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu 0xf1ff +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] @@ -2151,6 +2161,7 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_wait_alu 0xf1ff +; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 3eba106b861c6..348862d4d8ced 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -387,6 +387,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2381,6 +2382,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -2411,6 +2413,7 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -4307,6 +4310,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -4341,6 +4345,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -4388,6 +4393,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -4422,6 +4428,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6132,6 +6139,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6178,6 +6186,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6225,6 +6234,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6270,6 +6280,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -7485,6 +7496,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -9903,6 +9915,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterf ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index c17225594164f..ab867b089b875 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -379,6 +379,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -1591,6 +1592,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -1623,6 +1625,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3415,6 +3418,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3451,6 +3455,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3498,6 +3503,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3535,6 +3541,7 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5264,6 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5310,6 +5318,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5357,6 +5366,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5402,6 +5412,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6775,6 +6786,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6807,6 +6819,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -7995,6 +8008,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 @@ -8441,6 +8455,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -8492,6 +8507,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -8532,6 +8548,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -8582,6 +8599,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 56719dccbd08a..1a25904dd553f 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -379,6 +379,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__offset__waterfall__amdgp ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -1591,6 +1592,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -1623,6 +1625,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__offset__waterfall__amdg ; GFX12-NEXT: v_readfirstlane_b32 s6, v7 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3415,6 +3418,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3451,6 +3455,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3498,6 +3503,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -3535,6 +3541,7 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5264,6 +5271,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5310,6 +5318,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5357,6 +5366,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -5402,6 +5412,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6775,6 +6786,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -6807,6 +6819,7 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fmin_ret_v2f16__offset__waterfall ; GFX12-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -7995,6 +8008,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_v2bf16__offset__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s4, v0, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4 @@ -8441,6 +8455,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -8492,6 +8507,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -8532,6 +8548,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -8582,6 +8599,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmin_ret_v2bf16__offset__waterf ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index f0e7cba6924d8..ce4db2f84774b 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -48,15 +48,17 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: s_branch .LBB0_2 ; GFX1100-NEXT: .LBB0_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-NEXT: s_xor_b32 s1, s1, -1 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_cbranch_vccz .LBB0_4 ; GFX1100-NEXT: .LBB0_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 @@ -139,15 +141,17 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: s_branch .LBB1_2 ; GFX1100-NEXT: .LBB1_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-NEXT: s_xor_b32 s1, s1, -1 ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 ; GFX1100-NEXT: s_cbranch_vccz .LBB1_4 ; GFX1100-NEXT: .LBB1_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB1_1 @@ -239,6 +243,7 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1100-NEXT: s_cbranch_vccz .LBB2_4 ; GFX1100-NEXT: .LBB2_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr2 ; GFX1100-NEXT: s_cbranch_vccnz .LBB2_1 @@ -335,6 +340,7 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1100-NEXT: s_cbranch_vccz .LBB3_4 ; GFX1100-NEXT: .LBB3_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr2 ; GFX1100-NEXT: s_cbranch_vccnz .LBB3_1 @@ -421,15 +427,16 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: s_branch .LBB4_2 ; GFX1100-NEXT: .LBB4_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: s_and_b32 s1, s1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1100-NEXT: .LBB4_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB4_1 @@ -512,15 +519,16 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: s_branch .LBB5_2 ; GFX1100-NEXT: .LBB5_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: s_and_b32 s1, s1, vcc_lo +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB5_4 ; GFX1100-NEXT: .LBB5_2: ; %.a ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0 ; GFX1100-NEXT: ; implicit-def: $sgpr1 ; GFX1100-NEXT: s_cbranch_vccnz .LBB5_1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index cfe4d24d427e7..8581e4d030261 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -12426,13 +12426,14 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12483,13 +12484,14 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12890,13 +12892,14 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12947,13 +12950,14 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13355,13 +13359,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13410,13 +13415,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13804,13 +13810,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13859,13 +13866,14 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15417,13 +15425,14 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15475,13 +15484,14 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15885,13 +15895,14 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15941,13 +15952,14 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index d2cbc25bf7e04..883063b5471ca 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6474,13 +6474,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6882,13 +6883,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7671,13 +7673,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8068,13 +8071,14 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9068,13 +9072,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9479,13 +9484,14 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10283,13 +10289,14 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10340,13 +10347,14 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10748,13 +10756,14 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10805,13 +10814,14 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11648,13 +11658,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11703,13 +11714,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12098,13 +12110,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12153,13 +12166,14 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13281,13 +13295,14 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13339,13 +13354,14 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13750,13 +13766,14 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13806,13 +13823,14 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17588,6 +17606,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18024,6 +18043,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18467,6 +18487,7 @@ define void @flat_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -19393,6 +19414,7 @@ define void @flat_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index 805848fc3e1cc..c603421ca15b4 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6474,13 +6474,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6882,13 +6883,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7671,13 +7673,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8068,13 +8071,14 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9068,13 +9072,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9479,13 +9484,14 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10283,13 +10289,14 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10340,13 +10347,14 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10748,13 +10756,14 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10805,13 +10814,14 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11648,13 +11658,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11703,13 +11714,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12098,13 +12110,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12153,13 +12166,14 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13281,13 +13295,14 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13339,13 +13354,14 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13750,13 +13766,14 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13806,13 +13823,14 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17588,6 +17606,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memory( ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18024,6 +18043,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18467,6 +18487,7 @@ define void @flat_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -19393,6 +19414,7 @@ define void @flat_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index e0138d58963c8..c987effec3be3 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -9871,13 +9871,14 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9928,13 +9929,14 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10335,13 +10337,14 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10392,13 +10395,14 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11233,13 +11237,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11288,13 +11293,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11682,13 +11688,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11737,13 +11744,14 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12862,13 +12870,14 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12920,13 +12929,14 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13330,13 +13340,14 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13386,13 +13397,14 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: flat_load_b32 v3, v[0:1] ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17023,6 +17035,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -17459,6 +17472,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -17902,6 +17916,7 @@ define void @flat_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18828,6 +18843,7 @@ define void @flat_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 8c75b5c7c027e..2f08931f2287e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -2744,11 +2744,11 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo @@ -2760,7 +2760,7 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo @@ -2778,38 +2778,38 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo @@ -2821,7 +2821,7 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index fd7c7006b3612..969c6c3980fc3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -2749,11 +2749,11 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo @@ -2765,7 +2765,7 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo @@ -2783,38 +2783,38 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo @@ -2826,7 +2826,7 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll index 0959687d3834c..3e2680f55832d 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -75,6 +75,7 @@ define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { ; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index 4ee48716439bd..d97ea042b50fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -118,6 +118,7 @@ define float @safe_math_fract_f32(float %x, ptr addrspace(1) writeonly captures( ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -229,6 +230,7 @@ define float @safe_math_fract_f32_swap(float %x, ptr addrspace(1) writeonly capt ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2422,6 +2424,7 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| ; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 ; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -2569,6 +2572,7 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX12-TRUE16-NEXT: v_cmp_neq_f16_e64 s0, 0x7c00, |v0.l| ; GFX12-TRUE16-NEXT: v_floor_f16_e32 v3.l, v0.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, 0, v0.h, s0 ; GFX12-TRUE16-NEXT: global_store_b16 v[1:2], v3, off ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -2584,6 +2588,7 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no ; GFX12-FAKE16-NEXT: v_cmp_neq_f16_e64 vcc_lo, 0x7c00, |v0| ; GFX12-FAKE16-NEXT: v_floor_f16_e32 v4, v0 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-FAKE16-NEXT: global_store_b16 v[1:2], v4, off ; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -2804,14 +2809,14 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) writeon ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-FAKE16-NEXT: v_fract_f16_e32 v6, v0 ; GFX12-FAKE16-NEXT: v_floor_f16_e32 v5, v0 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_fract_f16_e32 v4, v3 ; GFX12-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v3, 0x204 ; GFX12-FAKE16-NEXT: v_floor_f16_e32 v7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 ; GFX12-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_pack_b32_f16 v4, v5, v7 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0 @@ -3093,6 +3098,7 @@ define float @safe_math_fract_f32_minimum(float %x, ptr addrspace(1) writeonly c ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3207,6 +3213,7 @@ define float @safe_math_fract_f32_minimum_swap(float %x, ptr addrspace(1) writeo ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3318,6 +3325,7 @@ define float @safe_math_fract_f32_minimumnum(float %x, ptr addrspace(1) writeonl ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -3429,6 +3437,7 @@ define float @safe_math_fract_f32_minimumnum_swap(float %x, ptr addrspace(1) wri ; GFX12-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0| ; GFX12-NEXT: v_floor_f32_e32 v4, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc_lo ; GFX12-NEXT: global_store_b32 v[1:2], v4, off ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 0ddb181decb94..100a560c1d127 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -166,6 +166,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -351,6 +352,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -542,6 +544,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -727,6 +730,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -907,6 +911,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1092,6 +1097,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1284,6 +1290,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1469,6 +1476,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1670,6 +1678,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1872,6 +1881,7 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2068,6 +2078,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2253,6 +2264,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2425,6 +2437,7 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2627,6 +2640,7 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -2823,6 +2837,7 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3004,6 +3019,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory( ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3159,6 +3175,7 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3354,6 +3371,7 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3554,6 +3572,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3747,6 +3766,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -3946,6 +3966,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4139,6 +4160,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4322,6 +4344,7 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4501,6 +4524,7 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4674,6 +4698,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4845,6 +4870,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5022,6 +5048,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5181,6 +5208,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5335,6 +5363,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5494,6 +5523,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5672,6 +5702,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -5831,6 +5862,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6002,6 +6034,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6158,6 +6191,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6358,6 +6392,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6551,6 +6586,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6720,6 +6756,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -6873,6 +6910,7 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7091,6 +7129,7 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7312,6 +7351,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7538,6 +7578,7 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7750,6 +7791,7 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -7955,6 +7997,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8165,6 +8208,7 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -8585,6 +8629,7 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9019,6 +9064,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9457,6 +9503,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -9868,6 +9915,7 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -10286,6 +10334,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -10707,6 +10756,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -11039,6 +11089,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -11350,6 +11401,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -11780,6 +11832,7 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -12206,6 +12259,7 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -12708,6 +12762,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -12767,13 +12822,14 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12824,13 +12880,14 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13220,6 +13277,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -13282,13 +13340,14 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13339,13 +13398,14 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13736,6 +13796,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -14221,6 +14282,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -14279,13 +14341,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14334,13 +14397,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14717,6 +14781,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -14777,13 +14842,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -14832,13 +14898,14 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -15216,6 +15283,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -15633,6 +15701,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -16029,6 +16098,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -16080,13 +16150,14 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -16138,13 +16209,14 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -16537,6 +16609,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -16599,13 +16672,14 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -16655,13 +16729,14 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17041,6 +17116,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -17269,6 +17345,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -17500,6 +17577,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_ ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -17734,6 +17812,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -17947,6 +18026,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -18157,6 +18237,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -18372,6 +18453,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -18609,6 +18691,7 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -18825,6 +18908,7 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -19068,6 +19152,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -19304,6 +19389,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -19532,6 +19618,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -19742,6 +19829,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -19984,6 +20072,7 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac ; GFX7-NEXT: v_mov_b32_e32 v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -20220,6 +20309,7 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -20604,6 +20694,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -20983,6 +21074,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_ ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -21365,6 +21457,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_ ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -21736,6 +21829,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -22104,6 +22198,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -22477,6 +22572,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -22862,6 +22958,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -23236,6 +23333,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -23613,6 +23711,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -23981,6 +24080,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -24357,6 +24457,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -24725,6 +24826,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -25101,6 +25203,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs ; GFX7-NEXT: v_mov_b32_e32 v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v1, v2 ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -25469,6 +25572,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] +; ; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index a24d6c5ff2222..faa3ee61427a2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4948,13 +4948,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5408,13 +5409,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6298,13 +6300,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6745,13 +6748,14 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7871,13 +7875,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8334,13 +8339,14 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9239,13 +9245,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9296,13 +9303,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9757,13 +9765,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9814,13 +9823,14 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10760,13 +10770,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10815,13 +10826,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11261,13 +11273,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11316,13 +11329,14 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12573,13 +12587,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12631,13 +12646,14 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13095,13 +13111,14 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13151,13 +13168,14 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17531,6 +17549,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18018,6 +18037,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18508,6 +18528,7 @@ define void @global_agent_atomic_fmax_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -19517,6 +19538,7 @@ define void @global_system_atomic_fmax_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index 5834d4ab4d8e7..cb66f85ff3ae2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4948,13 +4948,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB27_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5408,13 +5409,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB28_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6298,13 +6300,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB30_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -6745,13 +6748,14 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB31_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -7871,13 +7875,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v2, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -8334,13 +8339,14 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_max_num_f16_e32 v6, v2, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB35_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9239,13 +9245,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9296,13 +9303,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9757,13 +9765,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9814,13 +9823,14 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10760,13 +10770,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10815,13 +10826,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11261,13 +11273,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11316,13 +11329,14 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12573,13 +12587,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -12631,13 +12646,14 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13095,13 +13111,14 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13151,13 +13168,14 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17531,6 +17549,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__amdgpu_no_fine_grained_memor ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18018,6 +18037,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18508,6 +18528,7 @@ define void @global_agent_atomic_fmin_noret_v2bf16__offset12b_neg__amdgpu_no_fin ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -19517,6 +19538,7 @@ define void @global_system_atomic_fmin_noret_v2bf16__offset12b_pos__amdgpu_no_fi ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 765185327a03e..f869b5778bfb2 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -9768,13 +9768,14 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -9825,13 +9826,14 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10284,13 +10286,14 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -10341,13 +10344,14 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11283,13 +11287,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11338,13 +11343,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11782,13 +11788,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -11837,13 +11844,14 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13088,13 +13096,14 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13146,13 +13155,14 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-FAKE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13608,13 +13618,14 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -13664,13 +13675,14 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd ; GFX12-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, -4, v4 ; GFX12-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX12-FAKE16-NEXT: global_load_b32 v3, v[0:1], off ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_not_b32_e32 v5, v5 ; GFX12-FAKE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -17898,6 +17910,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18385,6 +18398,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace( ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -18875,6 +18889,7 @@ define void @global_agent_atomic_fsub_noret_v2bf16__offset12b_neg(ptr addrspace( ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 @@ -19884,6 +19899,7 @@ define void @global_system_atomic_fsub_noret_v2bf16__offset12b_pos(ptr addrspace ; GFX12-FAKE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v2, v2 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 81fc554a8857f..a3b0a7768ca67 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -1064,6 +1064,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 678d06e969276..546144dac6470 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5437,13 +5437,14 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v7, v4, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v5, v2 -; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v1, v7, v6 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v6, v4, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v2, 0 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd @@ -5451,16 +5452,16 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v5, v2 ; GFX1200-SDAG-NEXT: v_add3_u32 v4, v4, v6, v2 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v3, v1 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v5, v4, v0 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v0, v[3:4] +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v5, v1, v2 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v2, v0, v4 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v4, v1, v3 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v3, v[0:1] -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-NEXT: v_add3_u32 v1, v4, v1, v2 ; GFX1200-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5474,18 +5475,19 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v4, v2 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v6, v4, v2 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v3, v[0:1] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v4, v2 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v2, v[0:1] ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v4, v2 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v0, v5, vcc_lo +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v4, v3, v[1:2] ; GFX1200-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v6, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd @@ -5494,15 +5496,16 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v7, v4 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v5, v2, v[3:4] ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v5, v7, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v7, v6, v[0:1] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v5, v8 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v4, v[2:3] -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, v[0:1] ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v0, v5, v8 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v8, v[2:3] ; GFX1200-GISEL-NEXT: s_setpc_b64 s[30:31] entry: @@ -6288,18 +6291,19 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-SDAG-NEXT: v_add_co_u32 v10, vcc_lo, v2, 1 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v11, null, 0, v3, vcc_lo +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v12, v9, v4 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v13, v8, v5 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v14, v11, v6 ; GFX1200-SDAG-NEXT: v_mul_lo_u32 v15, v10, v7 ; GFX1200-SDAG-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v6, 0 +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add3_u32 v12, v1, v13, v12 ; GFX1200-SDAG-NEXT: v_add_co_u32 v1, vcc_lo, v0, v8 -; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add3_u32 v13, v3, v15, v14 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX1200-SDAG-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v9, vcc_lo ; GFX1200-SDAG-NEXT: v_add_co_u32 v8, vcc_lo, v2, v10 ; GFX1200-SDAG-NEXT: s_wait_alu 0xfffd @@ -6348,6 +6352,7 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: v_add_co_u32 v9, s0, v2, 1 ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v1, vcc_lo +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v0, v8, v4 ; GFX1200-GISEL-NEXT: v_mul_hi_u32 v1, v9, v6 ; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff @@ -6368,14 +6373,16 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v16, null, v1, v10, vcc_lo ; GFX1200-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v11, null, v2, v11, s0 ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[8:9], null, v14, v5, v[0:1] +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[9:10], null, v15, v7, v[3:4] ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v10, v15, v6 ; GFX1200-GISEL-NEXT: v_mul_lo_u32 v7, v14, v4 -; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[3:4], null, v16, v4, v[8:9] ; GFX1200-GISEL-NEXT: v_add_co_u32 v8, vcc_lo, v12, 1 +; GFX1200-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX1200-GISEL-NEXT: v_mad_co_u64_u32 v[4:5], null, v11, v6, v[9:10] ; GFX1200-GISEL-NEXT: s_wait_alu 0xfffd ; GFX1200-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index c2f8c2c44316a..2964f07c285f1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -121,11 +121,12 @@ define amdgpu_cs_chain void @wwm_in_shader(<3 x i32> inreg %sgpr, ptr inreg %cal ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v10, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v10 ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -240,11 +241,12 @@ define amdgpu_cs_chain void @phi_whole_struct(<3 x i32> inreg %sgpr, ptr inreg % ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL12-NEXT: v_dual_mov_b32 v11, v0 :: v_dual_add_nc_u32 v10, 42, v12 ; GISEL12-NEXT: ; %bb.2: ; %tail ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -361,12 +363,13 @@ define amdgpu_cs_chain void @control_flow(<3 x i32> inreg %sgpr, ptr inreg %call ; GISEL12-NEXT: s_or_saveexec_b32 s8, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v1, s8 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s9, 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s9 ; GISEL12-NEXT: s_mov_b32 exec_lo, s8 ; GISEL12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v13, v1 +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GISEL12-NEXT: v_mov_b32_e32 v11, v0 ; GISEL12-NEXT: s_or_b32 s4, vcc_lo, s4 ; GISEL12-NEXT: s_wait_alu 0xfffe @@ -611,11 +614,12 @@ define amdgpu_cs_chain void @use_v0_7(<3 x i32> inreg %sgpr, ptr inreg %callee, ; GISEL12-NEXT: s_or_saveexec_b32 s4, -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v13, 0x47, v12, s4 -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s8, 0, v13 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v13, s8 ; GISEL12-NEXT: s_mov_b32 exec_lo, s4 +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GISEL12-NEXT: v_dual_mov_b32 v11, v13 :: v_dual_add_nc_u32 v10, 42, v12 ; GISEL12-NEXT: ;;#ASMSTART ; GISEL12-NEXT: ; use v0-7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index baa904878310b..362b18f65e582 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -26,10 +26,11 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] -; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GISEL12-NEXT: v_cmp_ne_u32_e64 s[12:13], 0, v0 ; GISEL12-NEXT: s_wait_alu 0xf1ff ; GISEL12-NEXT: v_mov_b32_e32 v0, s12 +; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GISEL12-NEXT: v_mov_b32_e32 v1, s13 ; GISEL12-NEXT: s_mov_b64 exec, s[10:11] ; GISEL12-NEXT: v_mov_b32_e32 v11, v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index d05501fb0c8b7..8506e75fe9680 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -801,12 +801,13 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -833,13 +834,14 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -856,6 +858,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -976,12 +979,13 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -1008,13 +1012,14 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -1031,6 +1036,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -1167,7 +1173,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -1317,7 +1323,7 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -1470,7 +1476,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -1658,7 +1664,7 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -3360,12 +3366,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -3392,13 +3399,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3415,6 +3423,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3462,12 +3471,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_endpgm @@ -3494,13 +3504,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 ; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-SDAG-NEXT: s_endpgm @@ -3517,6 +3528,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm @@ -3726,7 +3738,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -3804,7 +3816,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -4029,7 +4041,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -4111,7 +4123,7 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff @@ -8559,6 +8571,7 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8597,6 +8610,7 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8635,6 +8649,7 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8673,6 +8688,7 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8711,6 +8727,7 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8749,6 +8766,7 @@ define void @v_permlanex16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8787,6 +8805,7 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8825,6 +8844,7 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s ; GFX12-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -8886,6 +8906,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -8901,6 +8922,7 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -8963,6 +8985,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -8978,6 +9001,7 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -9066,6 +9090,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 @@ -9088,6 +9113,7 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9183,6 +9209,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 @@ -9205,6 +9232,7 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v9 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v10 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -9282,6 +9310,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -9299,6 +9328,7 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9371,6 +9401,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9388,6 +9419,7 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -9460,6 +9492,7 @@ define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 @@ -9477,6 +9510,7 @@ define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9563,6 +9597,7 @@ define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 @@ -9584,6 +9619,7 @@ define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9682,6 +9718,7 @@ define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 @@ -9705,6 +9742,7 @@ define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 % ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9845,6 +9883,7 @@ define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 @@ -9878,6 +9917,7 @@ define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 % ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 @@ -9966,6 +10006,7 @@ define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 @@ -9983,6 +10024,7 @@ define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v6 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v7 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -10069,6 +10111,7 @@ define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 @@ -10090,6 +10133,7 @@ define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -10188,6 +10232,7 @@ define void @v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v10 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v11 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 @@ -10211,6 +10256,7 @@ define void @v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v10 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v11 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 @@ -10351,6 +10397,7 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v18 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v19 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 @@ -10384,6 +10431,7 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v18 ; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v19 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll index 8eab7e2fc62fa..10c000095fe3d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ptr.ll @@ -35,6 +35,7 @@ define void @v_permlane16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -76,6 +77,7 @@ define void @v_permlanex16_p0(ptr addrspace(1) %out, ptr %src0, i32 %src1, i32 % ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v4 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v5 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -128,6 +130,7 @@ define void @v_permlane16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src1 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 @@ -186,6 +189,7 @@ define void @v_permlanex16_v3p0(ptr addrspace(1) %out, <3 x ptr> %src0, i32 %src ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v8 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v9 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 @@ -231,6 +235,7 @@ define void @v_permlane16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -269,6 +274,7 @@ define void @v_permlanex16_p3(ptr addrspace(1) %out, ptr addrspace(3) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -311,6 +317,7 @@ define void @v_permlane16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -355,6 +362,7 @@ define void @v_permlanex16_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -395,6 +403,7 @@ define void @v_permlane16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -433,6 +442,7 @@ define void @v_permlanex16_p5(ptr addrspace(1) %out, ptr addrspace(5) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -475,6 +485,7 @@ define void @v_permlane16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -519,6 +530,7 @@ define void @v_permlanex16_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 @@ -559,6 +571,7 @@ define void @v_permlane16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -597,6 +610,7 @@ define void @v_permlanex16_p6(ptr addrspace(1) %out, ptr addrspace(6) %src0, i32 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v3 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -639,6 +653,7 @@ define void @v_permlane16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %sr ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 @@ -683,6 +698,7 @@ define void @v_permlanex16_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> %s ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v5 ; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v6 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll index 6c032ed061544..8b6ba1a3cc094 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.atomic.fadd.v2bf16.ll @@ -80,6 +80,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsrc__ ; GFX12-NEXT: v_readfirstlane_b32 s3, v6 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v6 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll index 701b80d59bcc6..74d5274f22e05 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -152,6 +152,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-TRUE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -180,6 +181,7 @@ define amdgpu_gs void @main(<4 x i32> %arg, i32 %arg1) { ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s6, v2 ; GFX12-FAKE16-NEXT: v_readfirstlane_b32 s7, v3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] ; GFX12-FAKE16-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3] ; GFX12-FAKE16-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll index 2c9f9a6ca4d55..148a5ba75d98b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16.ll @@ -49,6 +49,7 @@ define <2 x bfloat> @struct_ptr_buffer_atomic_add_v2bf16_rtn__vgpr_val__vgpr_rsr ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -88,6 +89,7 @@ define void @struct_ptr_buffer_atomic_add_v2bf16_noret__vgpr_val__vgpr_rsrc__vgp ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll index a86ad8ede2f2c..746b8791c39f9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_nortn.ll @@ -273,6 +273,7 @@ define void @struct_ptr_buffer_atomic_add_f32_noret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -410,6 +411,7 @@ define void @struct_ptr_buffer_atomic_add_v2f16_noret__vgpr_val__vgpr_rsrc__vgpr ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll index a3bdcbe17cc76..71c63bfd69734 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fadd_rtn.ll @@ -218,6 +218,7 @@ define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 @@ -326,6 +327,7 @@ define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__ ; GFX1200-NEXT: v_readfirstlane_b32 s3, v7 ; GFX1200-NEXT: s_wait_alu 0xf1ff ; GFX1200-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1200-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX1200-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX1200-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll index db6e0ad670747..e3889ab8f5a21 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f32.ll @@ -456,6 +456,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__vgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 @@ -610,6 +611,7 @@ define float @struct_ptr_buffer_atomic_add_f32_ret__vgpr_val__sgpr_rsrc__vgpr_vo ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll index eef6bb7b0788f..f001bf97fcd9e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f32.ll @@ -456,6 +456,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__vgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s6, v3 ; GFX12-NEXT: v_readfirstlane_b32 s7, v4 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] ; GFX12-NEXT: v_cmp_eq_u64_e64 s1, s[6:7], v[3:4] ; GFX12-NEXT: s_and_b32 s1, vcc_lo, s1 @@ -610,6 +611,7 @@ define float @struct_ptr_buffer_atomic_fmin_f32_ret__vgpr_val__sgpr_rsrc__vgpr_v ; GFX12-NEXT: v_readfirstlane_b32 s3, v7 ; GFX12-NEXT: s_wait_alu 0xf1ff ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[3:4] ; GFX12-NEXT: v_cmp_eq_u32_e64 s1, s3, v7 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index 2da07e51781e6..e6cc8f9f1f3cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -2077,6 +2077,7 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2112,6 +2113,7 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) { ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2153,6 +2155,7 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2188,6 +2191,7 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1) ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2229,6 +2233,7 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2264,6 +2269,7 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2305,6 +2311,7 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2340,6 +2347,7 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) { ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b16 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2381,6 +2389,7 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2416,6 +2425,7 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-GISEL-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2462,6 +2472,7 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 ; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off @@ -2503,6 +2514,7 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1100-GISEL-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_writelane_b32 v5, s0, s1 ; GFX1100-GISEL-NEXT: v_writelane_b32 v6, s2, s1 ; GFX1100-GISEL-NEXT: global_store_b64 v[0:1], v[5:6], off @@ -2586,6 +2598,7 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v7 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v6 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v16, s0, s1 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX1100-SDAG-NEXT: v_writelane_b32 v13, s4, s1 @@ -2981,6 +2994,7 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll index 11cdc625d9adb..40e124382df95 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ptr.ll @@ -40,6 +40,7 @@ define void @test_writelane_p0(ptr addrspace(1) %out, ptr %src, i32 %src1) { ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v4 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v5, s2, s1 ; GFX1100-SDAG-NEXT: global_store_b64 v[0:1], v[5:6], off @@ -118,6 +119,7 @@ define void @test_writelane_v3p0(ptr addrspace(1) %out, <3 x ptr> %src, i32 %src ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v7 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v6 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(1) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_writelane_b32 v14, s0, s1 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX1100-SDAG-NEXT: v_writelane_b32 v12, s3, s1 @@ -167,6 +169,7 @@ define void @test_writelane_p3(ptr addrspace(1) %out, ptr addrspace(3) %src, i32 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -218,8 +221,10 @@ define void @test_writelane_v3p3(ptr addrspace(1) %out, <3 x ptr addrspace(3)> % ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 ; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -261,6 +266,7 @@ define void @test_writelane_p5(ptr addrspace(1) %out, ptr addrspace(5) %src, i32 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -312,8 +318,10 @@ define void @test_writelane_v3p5(ptr addrspace(1) %out, <3 x ptr addrspace(5)> % ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 ; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -355,6 +363,7 @@ define void @test_writelane_p6(ptr addrspace(1) %out, ptr addrspace(6) %src, i32 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s0, v2 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s1, v3 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_writelane_b32 v4, s0, s1 ; GFX1100-SDAG-NEXT: global_store_b32 v[0:1], v4, off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -406,8 +415,10 @@ define void @test_writelane_v3p6(ptr addrspace(1) %out, <3 x ptr addrspace(6)> % ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s2, v3 ; GFX1100-SDAG-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1100-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v8, s0, s1 ; GFX1100-SDAG-NEXT: v_writelane_b32 v7, s2, s1 +; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_writelane_b32 v6, s3, s1 ; GFX1100-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off ; GFX1100-SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index c8f33f1464d47..66cf8a309ccf0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -110,15 +110,17 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v1, v5 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add3_u32 v1, v1, v5, v7 ; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v7 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v6, v8, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd @@ -286,20 +288,22 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 ; GFX12-NEXT: v_mad_co_i64_i32 v[10:11], null, v5, v3, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v1, v6 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v9, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_subrev_co_ci_u32_e64 v10, null, 0, v9, vcc_lo @@ -311,12 +315,13 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_subrev_co_ci_u32_e64 v7, null, 0, v5, vcc_lo ; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 ; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index ae4acfe35d08e..a42c71c4849bd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -8314,8 +8314,8 @@ define amdgpu_kernel void @local_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] @@ -9176,8 +9176,8 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 28504da5a6833..8351d28057564 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -7666,6 +7666,7 @@ define void @local_atomic_fmax_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 @@ -8128,6 +8129,7 @@ define void @local_atomic_fmax_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 48714b7282b1e..0c4aca88b3781 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -7666,6 +7666,7 @@ define void @local_atomic_fmin_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 @@ -8128,6 +8129,7 @@ define void @local_atomic_fmin_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 6879a7cfd09c2..37310b614c0db 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -8438,6 +8438,7 @@ define void @local_atomic_fsub_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 @@ -8900,6 +8901,7 @@ define void @local_atomic_fsub_noret_v2bf16__ofset(ptr addrspace(3) %ptr, <2 x b ; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v4, v6, v8, s0 diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 117f359be0c3b..66df769a6bebb 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -382,6 +382,7 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v13, v14, v[8:9] ; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v1, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 ; GFX12-NEXT: s_wait_alu 0xfffd @@ -1157,7 +1158,7 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo @@ -1247,10 +1248,11 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] @@ -1794,9 +1796,11 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { ; GFX12-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 ; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] %op = add i64 %arg0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll index 7d9b46a10c8f1..6246f2fd4fa5d 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll @@ -772,11 +772,11 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo @@ -788,7 +788,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo @@ -1109,7 +1109,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo @@ -13276,6 +13276,7 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 @@ -13296,16 +13297,17 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v50.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v52, v53 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -13835,9 +13837,9 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v34 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s3, v66, v66 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v68, v68, v67, s3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v68 @@ -15089,11 +15091,11 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo @@ -15105,7 +15107,7 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll index b97239081ac77..678d0a432a44f 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll @@ -781,11 +781,11 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo @@ -797,7 +797,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo @@ -1121,7 +1121,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo @@ -13331,6 +13331,7 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 @@ -13351,16 +13352,17 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v50.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v52, v53 ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) @@ -13890,9 +13892,9 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v34 ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e64 s3, v66, v66 ; GFX12-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v68, v68, v67, s3 ; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v67, v67, v68, vcc_lo ; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v81, v81 ; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v65, 16, v68 @@ -15149,11 +15151,11 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo @@ -15165,7 +15167,7 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index c4842c1f4f523..bbfd5f49981b6 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -277,24 +277,27 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v5, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v7, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 @@ -879,10 +882,11 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff ; GFX11-NEXT: s_waitcnt vmcnt(10) ; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v10, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2 @@ -892,24 +896,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v3, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v20, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v18, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v16, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v22, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v24, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v26, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2 @@ -1528,10 +1535,11 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 @@ -2317,27 +2325,30 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:2048 ; GFX11-NEXT: s_waitcnt vmcnt(6) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v7, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v5, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v12, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v10, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, vcc_lo ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v14, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 653d4b85a9a5b..32f2395c7b2ad 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -70,10 +70,11 @@ define ptr @gep_as0(ptr %p, i64 %offset) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 5 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo @@ -232,10 +233,11 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, 5 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll index 39428dc448018..1b7112ef4ddca 100644 --- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll @@ -916,6 +916,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -984,6 +985,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1052,6 +1054,7 @@ define <2 x half> @v_test_fmin_legacy_ule_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1156,6 +1159,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_safe(<2 x half> %a, <2 x half> % ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1224,6 +1228,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nnan_flag(<2 x half> %a, <2 x ha ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1292,6 +1297,7 @@ define <2 x half> @v_test_fmax_legacy_uge_v2f16_nsz_flag(<2 x half> %a, <2 x hal ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll index dfa50ce55a521..30ed6ae5484c6 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll @@ -2502,6 +2502,7 @@ define i64 @test_vector_reduce_add_v3i64(<3 x i64> %v) { ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo @@ -2639,6 +2640,7 @@ define i64 @test_vector_reduce_add_v4i64(<4 x i64> %v) { ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -2657,6 +2659,7 @@ define i64 @test_vector_reduce_add_v4i64(<4 x i64> %v) { ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -2892,6 +2895,7 @@ define i64 @test_vector_reduce_add_v8i64(<8 x i64> %v) { ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -2922,6 +2926,7 @@ define i64 @test_vector_reduce_add_v8i64(<8 x i64> %v) { ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -3371,6 +3376,7 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo @@ -3427,6 +3433,7 @@ define i64 @test_vector_reduce_add_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, vcc_lo +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index 16fbd1eabb305..f0829b53168d9 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3542,9 +3542,9 @@ define i64 @test_vector_reduce_smax_v4i64(<4 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3562,9 +3562,9 @@ define i64 @test_vector_reduce_smax_v4i64(<4 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3873,9 +3873,9 @@ define i64 @test_vector_reduce_smax_v8i64(<8 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3907,9 +3907,9 @@ define i64 @test_vector_reduce_smax_v8i64(<8 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4476,11 +4476,12 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -4488,7 +4489,7 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 @@ -4499,11 +4500,11 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4545,7 +4546,7 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[12:13] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s1 @@ -4553,25 +4554,25 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e64 s0, v[14:15], v[30:31] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v14, v30, v14, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v15, v31, v15, s0 ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[6:7], v[14:15] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7 ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index bb868621c23d7..e67420562e257 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3542,9 +3542,9 @@ define i64 @test_vector_reduce_smin_v4i64(<4 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3562,9 +3562,9 @@ define i64 @test_vector_reduce_smin_v4i64(<4 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3873,9 +3873,9 @@ define i64 @test_vector_reduce_smin_v8i64(<8 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3907,9 +3907,9 @@ define i64 @test_vector_reduce_smin_v8i64(<8 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4476,11 +4476,12 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -4488,7 +4489,7 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 @@ -4499,11 +4500,11 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4545,7 +4546,7 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[12:13] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s1 @@ -4553,25 +4554,25 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e64 s0, v[14:15], v[30:31] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v14, v30, v14, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v15, v31, v15, s0 ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[14:15] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7 ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 2eeedd4cfffba..92993d07b4f8f 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3423,9 +3423,9 @@ define i64 @test_vector_reduce_umax_v4i64(<4 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3443,9 +3443,9 @@ define i64 @test_vector_reduce_umax_v4i64(<4 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3754,9 +3754,9 @@ define i64 @test_vector_reduce_umax_v8i64(<8 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3788,9 +3788,9 @@ define i64 @test_vector_reduce_umax_v8i64(<8 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4357,11 +4357,12 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -4369,7 +4370,7 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 @@ -4380,11 +4381,11 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4426,7 +4427,7 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[12:13] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s1 @@ -4434,25 +4435,25 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[14:15], v[30:31] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v14, v30, v14, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v15, v31, v15, s0 ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7 ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index 6e4a06b3f8f4e..2bcee373d9247 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3162,9 +3162,9 @@ define i64 @test_vector_reduce_umin_v4i64(<4 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3182,9 +3182,9 @@ define i64 @test_vector_reduce_umin_v4i64(<4 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3493,9 +3493,9 @@ define i64 @test_vector_reduce_umin_v8i64(<8 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -3527,9 +3527,9 @@ define i64 @test_vector_reduce_umin_v8i64(<8 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4096,11 +4096,12 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -4108,7 +4109,7 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 @@ -4119,11 +4120,11 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 @@ -4165,7 +4166,7 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[12:13] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s1 @@ -4173,25 +4174,25 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e64 s0, v[14:15], v[30:31] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v14, v30, v14, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v15, v31, v15, s0 ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[6:7], v[14:15] ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v6, v14, v6 :: v_dual_cndmask_b32 v7, v15, v7 ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX12-GISEL-NEXT: s_wait_alu 0xf1ff +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1