diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 279de32a9cee8..4548beadf23ae 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -512,12 +512,13 @@ let OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue defm: Ternary_i16_Pats_gfx9; } // End OtherPredicates = [isGFX10Plus, Has16BitInsts], True16Predicate = NotHasTrue16BitInsts -class ThreeOpFragSDAG : PatFrag< +class ThreeOpFragSDAG : PatFrag< (ops node:$x, node:$y, node:$z), // When the inner operation is used multiple times, selecting 3-op // instructions may still be beneficial -- if the other users can be // combined similarly. Let's be conservative for now. - (op2 (HasOneUseBinOp node:$x, node:$y), node:$z), + !if(op1IsRight, (op2 node:$z, (HasOneUseBinOp node:$x, node:$y)), + (op2 (HasOneUseBinOp node:$x, node:$y), node:$z)), [{ // Only use VALU ops when the result is divergent. if (!N->isDivergent()) @@ -544,7 +545,10 @@ class ThreeOpFragSDAG : PatFrag< let PredicateCodeUsesOperands = 1; } -class ThreeOpFrag : ThreeOpFragSDAG { +// Matches (op2 (op1 x, y), z) if op1IsRight = 0 and +// matches (op2 z, (op1 x, y)) if op1IsRight = 1. +class ThreeOpFrag : ThreeOpFragSDAG { // The divergence predicate is irrelevant in GlobalISel, as we have // proper register bank checks. We just need to verify the constant // bus restriction when all the sources are considered. 
@@ -834,12 +838,19 @@ def : GCNPat< (DivergentBinFrag i32:$src0, IsPow2Plus1:$src1), (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>; -let SubtargetPredicate = HasLshlAddU64Inst in +let SubtargetPredicate = HasLshlAddU64Inst in { def : GCNPat< (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) >; +def : GCNPat < + // (ptradd z, (shl x, y)) -> ((x << y) + z) + (ThreeOpFrag i64:$src0, i32:$src1, i64:$src2), + (V_LSHL_ADD_U64_e64 VSrc_b64:$src0, VSrc_b32:$src1, VSrc_b64:$src2) +>; +} // End SubtargetPredicate = HasLshlAddU64Inst + def : VOPBinOpClampPat; def : VOPBinOpClampPat; @@ -908,19 +919,24 @@ multiclass IMAD32_Pats { // Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul. // We need to separate this because otherwise OtherPredicates would be overriden. -class IMAD32_Mul24_Pat: GCNPat < - (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)), - (inst $src0, $src1, $src2, 0 /* clamp */) - >; +class IMAD32_Mul24_Pats_Impl : GCNPat < + !if(mulIsRight, (i64 (AddOp i64:$src2, (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)))), + (i64 (AddOp (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2))), + (inst $src0, $src1, $src2, 0 /* clamp */)>; + +multiclass IMAD32_Mul24_Pats { + def : IMAD32_Mul24_Pats_Impl; + def : IMAD32_Mul24_Pats_Impl; +} // exclude pre-GFX9 where it was slow let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in { defm : IMAD32_Pats; - def : IMAD32_Mul24_Pat; + defm : IMAD32_Mul24_Pats; } def VOP3_PERMLANE_Profile : VOP3_Profile, VOP3_OPSEL> { diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index d48bfe0bb7f21..34bb98550de04 100644 --- 
a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -266,18 +266,11 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { ; Use non-zero shift amounts in v_lshl_add_u64. define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { -; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: select_v_lshl_add_u64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i64, ptr %base, i64 %voffset ret ptr %gep } @@ -285,23 +278,13 @@ define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { ; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the ; mul into a mul24. 
define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { -; GFX942_PTRADD-LABEL: fold_mul24_into_mad: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_and_b32_e32 v2, 0xfffff, v2 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v4, 0xfffff, v4 -; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v4 -; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v2, v2, v4 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: fold_mul24_into_mad: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_and_b32_e32 v2, 0xfffff, v2 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v3, 0xfffff, v4 -; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: fold_mul24_into_mad: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_and_b32_e32 v2, 0xfffff, v2 +; GFX942-NEXT: v_and_b32_e32 v3, 0xfffff, v4 +; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %a_masked = and i64 %a, u0xfffff %b_masked = and i64 %b, u0xfffff %mul = mul i64 %a_masked, %b_masked diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 653d4b85a9a5b..1c4a9547ed189 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -25,20 +25,12 @@ define ptr @gep_as0(ptr %p, i64 %offset) { ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX942_PTRADD-LABEL: gep_as0: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_PTRADD-NEXT: s_setpc_b64 
s[30:31] -; -; GFX942_LEGACY-LABEL: gep_as0: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: gep_as0: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: gep_as0: ; GFX10: ; %bb.0: ; %entry @@ -187,20 +179,12 @@ define ptr @multi_gep_as0(ptr %p, i64 %offset) { ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX942_PTRADD-LABEL: multi_gep_as0: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: multi_gep_as0: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: multi_gep_as0: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 5 +; GFX942-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: multi_gep_as0: ; GFX10: ; %bb.0: ; %entry @@ -535,3 +519,5 @@ entry: ; GFX12_PTRADD: {{.*}} ; GFX8_LEGACY: {{.*}} ; GFX8_PTRADD: {{.*}} +; GFX942_LEGACY: {{.*}} +; GFX942_PTRADD: {{.*}}