diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 2e7f25b67fb63..a82b15d07d0ad 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -9,6 +9,12 @@
 def BITOP3_32 : ComplexPattern<i32, 4, "SelectBITOP3", [and, or, xor]>;
 def BITOP3_16 : ComplexPattern<i16, 4, "SelectBITOP3", [and, or, xor]>;
 
+// Matches PTRADD as a commutative operation. Patterns using this PatFrag must
+// set GISelShouldIgnore = 1, as commuting the corresponding G_PTR_ADD is
+// invalid.
+def ptradd_commutative : PatFrags<(ops node:$src0, node:$src1),
+  [(ptradd node:$src0, node:$src1), (ptradd node:$src1, node:$src0)]>;
+
 // Special case for v_div_fmas_{f32|f64}, since it seems to be the
 // only VOP instruction that implicitly reads VCC.
 let Asm64 = " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod" in {
@@ -836,12 +842,20 @@ def : GCNPat<
   (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
   (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
 
-let SubtargetPredicate = HasLshlAddU64Inst in
+let SubtargetPredicate = HasLshlAddU64Inst in {
 def : GCNPat<
   (ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
   (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)
 >;
 
+def : GCNPat <
+  // (ptradd z, (shl x, y)) or (ptradd (shl x, y), z) -> ((x << y) + z)
+  (ThreeOpFrag<shl_0_to_4, ptradd_commutative> i64:$src0, i32:$src1, i64:$src2),
+  (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2)> {
+  let GISelShouldIgnore = 1;
+}
+} // End SubtargetPredicate = HasLshlAddU64Inst
+
 def : VOPBinOpClampPat<saddsat, V_ADD_I32_e64, i32>;
 def : VOPBinOpClampPat<ssubsat, V_SUB_I32_e64, i32>;
 
@@ -910,19 +924,26 @@ multiclass IMAD32_Pats<VOP3_Pseudo inst> {
 
 // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
 // We need to separate this because otherwise OtherPredicates would be overridden.
-class IMAD32_Mul24_Pat<VOP3_Pseudo inst> : GCNPat <
-  (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
+class IMAD32_Mul24_Pats_Impl<VOP3_Pseudo inst, SDPatternOperator AddOp> : GCNPat <
+  (i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
   (inst $src0, $src1, $src2, 0 /* clamp */)
 >;
 
+multiclass IMAD32_Mul24_Pats<VOP3_Pseudo inst> {
+  def : IMAD32_Mul24_Pats_Impl<inst, add>;
+  def : IMAD32_Mul24_Pats_Impl<inst, ptradd> {
+    let GISelShouldIgnore = 1;
+  }
+}
+
 // exclude pre-GFX9 where it was slow
 let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
   defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
-  def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
+  defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_e64>;
 }
 let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
   defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
-  def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
+  defm : IMAD32_Mul24_Pats<V_MAD_U64_U32_gfx11_e64>;
 }
 
 def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile<[i32, i32, i32, i32]>, VOP3_OPSEL> {
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index d48bfe0bb7f21..34bb98550de04 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -266,18 +266,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
 
 ; Use non-zero shift amounts in v_lshl_add_u64.
 define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: select_v_lshl_add_u64:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 3, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: select_v_lshl_add_u64:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: select_v_lshl_add_u64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds i64, ptr %base, i64 %voffset
   ret ptr %gep
 }
@@ -285,23 +278,13 @@ define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) {
 
 ; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the
 ; mul into a mul24.
 define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
-; GFX942_PTRADD-LABEL: fold_mul24_into_mad:
-; GFX942_PTRADD:       ; %bb.0:
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v2, 0xfffff, v2
-; GFX942_PTRADD-NEXT:    v_and_b32_e32 v4, 0xfffff, v4
-; GFX942_PTRADD-NEXT:    v_mul_hi_u32_u24_e32 v3, v2, v4
-; GFX942_PTRADD-NEXT:    v_mul_u32_u24_e32 v2, v2, v4
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: fold_mul24_into_mad:
-; GFX942_LEGACY:       ; %bb.0:
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v2, 0xfffff, v2
-; GFX942_LEGACY-NEXT:    v_and_b32_e32 v3, 0xfffff, v4
-; GFX942_LEGACY-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: fold_mul24_into_mad:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_and_b32_e32 v2, 0xfffff, v2
+; GFX942-NEXT:    v_and_b32_e32 v3, 0xfffff, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a_masked = and i64 %a, u0xfffff
   %b_masked = and i64 %b, u0xfffff
   %mul = mul i64 %a_masked, %b_masked
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
index 32f2395c7b2ad..9dd25025d4381 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
@@ -25,20 +25,12 @@ define ptr @gep_as0(ptr %p, i64 %offset) {
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942_PTRADD-LABEL: gep_as0:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: gep_as0:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: gep_as0:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: gep_as0:
 ; GFX10:       ; %bb.0: ; %entry
@@ -188,20 +180,12 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) {
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX942_PTRADD-LABEL: multi_gep_as0:
-; GFX942_PTRADD:       ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_PTRADD-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: multi_gep_as0:
-; GFX942_LEGACY:       ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
-; GFX942_LEGACY-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
-; GFX942_LEGACY-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-LABEL: multi_gep_as0:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: multi_gep_as0:
 ; GFX10:       ; %bb.0: ; %entry
@@ -537,3 +521,5 @@ entry:
 ; GFX12_PTRADD: {{.*}}
 ; GFX8_LEGACY: {{.*}}
 ; GFX8_PTRADD: {{.*}}
+; GFX942_LEGACY: {{.*}}
+; GFX942_PTRADD: {{.*}}
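
For illustration only (not part of the patch): a minimal LLVM IR sketch, in the style of the tests above, of an input that the new ThreeOpFrag<shl_0_to_4, ptradd_commutative> pattern should select into a single v_lshl_add_u64 on gfx942 when the SelectionDAG ptradd path is enabled. The function name is hypothetical, and the expected codegen is inferred from the select_v_lshl_add_u64 checks above rather than produced by llc.

; Hypothetical sketch; expected gfx942 codegen (assumed, mirroring
; select_v_lshl_add_u64):
;   v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1]
define ptr @lshl_add_u64_sketch(ptr %base, i64 %idx) {
  ; An i64 gep scales %idx by 8, i.e. (ptradd %base, (shl %idx, 3)); the
  ; ptradd_commutative PatFrag also matches the operand-swapped form.
  %gep = getelementptr inbounds i64, ptr %base, i64 %idx
  ret ptr %gep
}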