-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[AMDGPU][SDAG] Handle ISD::PTRADD in various special cases #145330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: users/ritter-x2a/06-17-_amdgpu_sdag_test_isd_ptradd_handling_in_various_special_cases
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-backend-amdgpu Author: Fabian Ritter (ritter-x2a) Changes: There are more places in SIISelLowering.cpp and AMDGPUISelDAGToDAG.cpp that check for ISD::ADD in a pointer context, but as far as I can tell those are only relevant for 32-bit pointer arithmetic (like frame indices/scratch addresses and LDS), for which we don't enable PTRADD generation yet. For SWDEV-516125. Patch is 21.78 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145330.diff 6 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0f5a943d663d7..06953bdb31ea4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8219,7 +8219,7 @@ static bool isMemSrcFromConstant(SDValue Src, ConstantDataArraySlice &Slice) {
GlobalAddressSDNode *G = nullptr;
if (Src.getOpcode() == ISD::GlobalAddress)
G = cast<GlobalAddressSDNode>(Src);
- else if (Src.getOpcode() == ISD::ADD &&
+ else if (Src->isAnyAdd() &&
Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
Src.getOperand(1).getOpcode() == ISD::Constant) {
G = cast<GlobalAddressSDNode>(Src.getOperand(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 66717135c9adf..63ca47bb119e5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -615,8 +615,14 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth,
// operands on the new node are also disjoint.
SDNodeFlags Flags(Op->getFlags().hasDisjoint() ? SDNodeFlags::Disjoint
: SDNodeFlags::None);
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::PTRADD) {
+ // It isn't a ptradd anymore if it doesn't operate on the entire
+ // pointer.
+ Opcode = ISD::ADD;
+ }
SDValue X = DAG.getNode(
- Op.getOpcode(), dl, SmallVT,
+ Opcode, dl, SmallVT,
DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)),
DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1)), Flags);
assert(DemandedSize <= SmallVTBits && "Narrowed below demanded bits?");
@@ -2851,6 +2857,11 @@ bool TargetLowering::SimplifyDemandedBits(
return TLO.CombineTo(Op, And1);
}
[[fallthrough]];
+ case ISD::PTRADD:
+ if (Op.getOperand(0).getValueType() != Op.getOperand(1).getValueType())
+ break;
+ // PTRADD behaves like ADD if pointers are represented as integers.
+ [[fallthrough]];
case ISD::ADD:
case ISD::SUB: {
// Add, Sub, and Mul don't demand any bits in positions beyond that
@@ -2960,10 +2971,10 @@ bool TargetLowering::SimplifyDemandedBits(
if (Op.getOpcode() == ISD::MUL) {
Known = KnownBits::mul(KnownOp0, KnownOp1);
- } else { // Op.getOpcode() is either ISD::ADD or ISD::SUB.
+ } else { // Op.getOpcode() is either ISD::ADD, ISD::PTRADD, or ISD::SUB.
Known = KnownBits::computeForAddSub(
- Op.getOpcode() == ISD::ADD, Flags.hasNoSignedWrap(),
- Flags.hasNoUnsignedWrap(), KnownOp0, KnownOp1);
+ Op->isAnyAdd(), Flags.hasNoSignedWrap(), Flags.hasNoUnsignedWrap(),
+ KnownOp0, KnownOp1);
}
break;
}
@@ -5593,7 +5604,7 @@ bool TargetLowering::isGAPlusOffset(SDNode *WN, const GlobalValue *&GA,
return true;
}
- if (N->getOpcode() == ISD::ADD) {
+ if (N->isAnyAdd()) {
SDValue N1 = N->getOperand(0);
SDValue N2 = N->getOperand(1);
if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6e990cb2e160c..ee73ad5dda945 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1449,7 +1449,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
C1 = nullptr;
}
- if (N0.getOpcode() == ISD::ADD) {
+ if (N0->isAnyAdd()) {
// (add N2, N3) -> addr64, or
// (add (add N2, N3), C1) -> addr64
SDValue N2 = N0.getOperand(0);
@@ -1899,7 +1899,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
// Match the variable offset.
- if (Addr.getOpcode() == ISD::ADD) {
+ if (Addr->isAnyAdd()) {
LHS = Addr.getOperand(0);
RHS = Addr.getOperand(1);
@@ -2230,7 +2230,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
SDValue N0, N1;
// Extract the base and offset if possible.
- if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) || Addr->isAnyAdd()) {
N0 = Addr.getOperand(0);
N1 = Addr.getOperand(1);
} else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ec57d231dab5d..029ea2370e18d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10488,7 +10488,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue VOffset;
// Try to split SAddr and VOffset. Global and LDS pointers share the same
// immediate offset, so we cannot use a regular SelectGlobalSAddr().
- if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
+ if (Addr->isDivergent() && Addr->isAnyAdd()) {
SDValue LHS = Addr.getOperand(0);
SDValue RHS = Addr.getOperand(1);
@@ -12038,8 +12038,7 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
// We only do this to handle cases where it's profitable when there are
// multiple uses of the add, so defer to the standard combine.
- if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
- N0->hasOneUse())
+ if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
return SDValue();
const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
@@ -12078,6 +12077,8 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
N->getFlags().hasNoUnsignedWrap() &&
(N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
+ // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
+ // be sure that the new left operand is a proper base pointer.
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
index fab56383ffa8a..ff90f1f175c3c 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
@@ -5,50 +5,26 @@
; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF.
define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; GFX6_PTRADD-LABEL: v_add_i32:
-; GFX6_PTRADD: ; %bb.0:
-; GFX6_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX6_PTRADD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX6_PTRADD-NEXT: s_mov_b32 s7, 0x100f000
-; GFX6_PTRADD-NEXT: s_mov_b32 s10, 0
-; GFX6_PTRADD-NEXT: s_mov_b32 s11, s7
-; GFX6_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6_PTRADD-NEXT: v_mov_b32_e32 v1, s3
-; GFX6_PTRADD-NEXT: v_add_i32_e32 v0, vcc, s2, v0
-; GFX6_PTRADD-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6_PTRADD-NEXT: s_mov_b32 s8, s10
-; GFX6_PTRADD-NEXT: s_mov_b32 s9, s10
-; GFX6_PTRADD-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GFX6_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX6_PTRADD-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GFX6_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX6_PTRADD-NEXT: s_mov_b32 s6, -1
-; GFX6_PTRADD-NEXT: s_mov_b32 s4, s0
-; GFX6_PTRADD-NEXT: s_mov_b32 s5, s1
-; GFX6_PTRADD-NEXT: v_add_i32_e32 v0, vcc, v2, v0
-; GFX6_PTRADD-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX6_PTRADD-NEXT: s_endpgm
-;
-; GFX6_LEGACY-LABEL: v_add_i32:
-; GFX6_LEGACY: ; %bb.0:
-; GFX6_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX6_LEGACY-NEXT: s_mov_b32 s7, 0x100f000
-; GFX6_LEGACY-NEXT: s_mov_b32 s10, 0
-; GFX6_LEGACY-NEXT: s_mov_b32 s11, s7
-; GFX6_LEGACY-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX6_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6_LEGACY-NEXT: s_mov_b64 s[8:9], s[2:3]
-; GFX6_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX6_LEGACY-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GFX6_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX6_LEGACY-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GFX6_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX6_LEGACY-NEXT: s_mov_b32 s6, -1
-; GFX6_LEGACY-NEXT: s_mov_b32 s4, s0
-; GFX6_LEGACY-NEXT: s_mov_b32 s5, s1
-; GFX6_LEGACY-NEXT: v_add_i32_e32 v0, vcc, v2, v0
-; GFX6_LEGACY-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX6_LEGACY-NEXT: s_endpgm
+; GFX6-LABEL: v_add_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX6-NEXT: s_mov_b32 s7, 0x100f000
+; GFX6-NEXT: s_mov_b32 s10, 0
+; GFX6-NEXT: s_mov_b32 s11, s7
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_mov_b64 s[8:9], s[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v1, 0
+; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, -1
+; GFX6-NEXT: s_mov_b32 s4, s0
+; GFX6-NEXT: s_mov_b32 s5, s1
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
%b_ptr = getelementptr i32, ptr addrspace(1) %gep, i32 1
@@ -60,4 +36,5 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX6: {{.*}}
+; GFX6_LEGACY: {{.*}}
+; GFX6_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 0cd920616c515..893deb35fe822 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -294,27 +294,15 @@ define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) {
; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectGlobalSAddr.
define amdgpu_kernel void @uniform_base_varying_offset_imm(ptr addrspace(1) %p) {
-; GFX942_PTRADD-LABEL: uniform_base_varying_offset_imm:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off offset:16
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: uniform_base_varying_offset_imm:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 1
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[0:1] offset:16
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: uniform_base_varying_offset_imm:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v0, v1, s[0:1] offset:16
+; GFX942-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%shift = shl i32 %tid, 2
@@ -328,33 +316,18 @@ entry:
; Adjusted from global-saddr-load.ll. Tests PTRADD handling in
; AMDGPUDAGToDAGISel::SelectSMRDBaseOffset.
define amdgpu_kernel void @global_load_saddr_i32_uniform_offset(ptr addrspace(1) %sbase, i32 %soffset, ptr addrspace(1) %r) {
-; GFX942_PTRADD-LABEL: global_load_saddr_i32_uniform_offset:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_load_dword s6, s[4:5], 0x8
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, s6
-; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, 0
-; GFX942_PTRADD-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0
-; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: global_load_saddr_i32_uniform_offset:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dword s6, s[4:5], 0x8
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_load_dword s0, s[0:1], s6 offset:0x0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0
-; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[2:3]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: global_load_saddr_i32_uniform_offset:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x8
+; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_load_dword s0, s[0:1], s6 offset:0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT: s_endpgm
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%load = load i32, ptr addrspace(1) %gep0
@@ -366,28 +339,15 @@ define amdgpu_kernel void @global_load_saddr_i32_uniform_offset(ptr addrspace(1)
; Adjusted from llvm.amdgcn.global.load.lds.ll, tests the offset lowering for
; Intrinsic::amdgcn_global_load_lds.
define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture inreg %gptr, ptr addrspace(3) nocapture %lptr, i32 %voffset) {
-; GFX942_PTRADD-LABEL: global_load_lds_dword_saddr_and_vaddr:
-; GFX942_PTRADD: ; %bb.0: ; %main_body
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, v1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, 0
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: v_readfirstlane_b32 s0, v0
-; GFX942_PTRADD-NEXT: s_mov_b32 m0, s0
-; GFX942_PTRADD-NEXT: s_nop 0
-; GFX942_PTRADD-NEXT: global_load_lds_dword v[2:3], off offset:48 sc1
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: global_load_lds_dword_saddr_and_vaddr:
-; GFX942_LEGACY: ; %bb.0: ; %main_body
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_readfirstlane_b32 s2, v0
-; GFX942_LEGACY-NEXT: s_mov_b32 m0, s2
-; GFX942_LEGACY-NEXT: s_nop 0
-; GFX942_LEGACY-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: global_load_lds_dword_saddr_and_vaddr:
+; GFX942: ; %bb.0: ; %main_body
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s2, v0
+; GFX942-NEXT: s_mov_b32 m0, s2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%voffset.64 = zext i32 %voffset to i64
%gep = getelementptr i8, ptr addrspace(1) %gptr, i64 %voffset.64
@@ -398,29 +358,17 @@ main_body:
; Taken from shl_add_ptr_global.ll, tests PTRADD handling in
; SITargetLowering::performSHLPtrCombine.
define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr addrspace(1) %extra.use, ptr addrspace(1) %ptr) {
-; GFX942_PTRADD-LABEL: shl_base_global_ptr_global_atomic_fadd:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_mov_b64 s[0:1], 0x80
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, s[0:1]
-; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1]
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v6, 0x42c80000
-; GFX942_PTRADD-NEXT: global_atomic_add_f32 v[4:5], v6, off
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v[2:3], v[0:1], off sc0 sc1
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: shl_base_global_ptr_global_atomic_fadd:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[0:1], 2, v[4:5]
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v6, 0x42c80000
-; GFX942_LEGACY-NEXT: global_atomic_add_f32 v[0:1], v6, off offset:512
-; GFX942_LEGACY-NEXT: s_mov_b64 s[0:1], 0x80
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, s[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v[2:3], v[0:1], off sc0 sc1
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: shl_base_global_ptr_global_atomic_fadd:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshlrev_b64 v[0:1], 2, v[4:5]
+; GFX942-NEXT: v_mov_b32_e32 v6, 0x42c80000
+; GFX942-NEXT: global_atomic_add_f32 v[0:1], v6, off offset:512
+; GFX942-NEXT: s_mov_b64 s[0:1], 0x80
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, s[0:1]
+; GFX942-NEXT: global_store_dwordx2 v[2:3], v[0:1], off sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%arrayidx0 = getelementptr inbounds [512 x i32], ptr addrspace(1) %ptr, i64 0, i64 32
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
@@ -433,27 +381,16 @@ define void @shl_base_global_ptr_global_atomic_fadd(ptr addrspace(1) %out, ptr a
; Test PTRADD handling in TargetLowering::SimplifyDemandedBits and
; TargetLowering::ShrinkDemandedOp.
define i32 @gep_in_const_as_cast_to_const32_as(ptr addrspace(4) %src, i64 %offset) {
-; GFX942_PTRADD-LABEL: gep_in_const_as_cast_to_const32_as:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: s_mov_b32 s1, 0
-; GFX942_PTRADD-NEXT: v_readfirstlane_b32 s0, v0
-; GFX942_PTRADD-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s0
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: gep_in_const_as_cast_to_const32_as:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_add_u32_e32 v0, v0, v2
-; GFX942_LEGACY-NEXT: s_mov_b32 s1, 0
-; GFX942_LEGACY-NEXT: v_readfirstlane_b32 s0, v0
-; GFX942_LEGACY-NEXT: s_load_dword s0, s[0:1], 0x0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s0
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: gep_in_const_as_cast_to_const32_as:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX942-NEXT: s_mov_b32 s1, 0
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, s0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
entry:
%gep = getelementptr i8, ptr addrspace(4) %src, i64 %offset
%gep.cast = addrspacecast ptr addrspace(4) %gep to ptr addrspace(6)
@@ -465,29 +402,14 @@ entry:
; Test PTRADD handling in isMemSrcFromConstant.
defi...
[truncated]
|
53a1700
to
7c9ac3c
Compare
0d60002
to
286252e
Compare
7c9ac3c
to
2fb216f
Compare
286252e
to
e823a0d
Compare
// It isn't a ptradd anymore if it doesn't operate on the entire
// pointer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a bit surprising but I suppose it's a consequence of not actually having pointer types, which would imply an additional operation is needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You could make a PTRADD here work mechanically for AMDGPU by adding support of 32-bit PTRADDs (which requires more DAGCombines and is not on the critical path for me at the moment), but I'd still argue that this shouldn't be a PTRADD even then. As I see it, the point of having the PTRADD node is being able to tell that the left operand is an address, which it isn't if you chop off the leading bits (unless the chopping is done to implement an address space cast).
Known = KnownBits::computeForAddSub(
-    Op.getOpcode() == ISD::ADD, Flags.hasNoSignedWrap(),
-    Flags.hasNoUnsignedWrap(), KnownOp0, KnownOp1);
+    Op->isAnyAdd(), Flags.hasNoSignedWrap(), Flags.hasNoUnsignedWrap(),
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggested change:
-    Op->isAnyAdd(), Flags.hasNoSignedWrap(), Flags.hasNoUnsignedWrap(),
+    Op.getOpcode() != ISD::SUB, Flags.hasNoSignedWrap(), Flags.hasNoUnsignedWrap(),
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -2230,7 +2230,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,

  SDValue N0, N1;
  // Extract the base and offset if possible.
- if (CurDAG->isBaseWithConstantOffset(Addr) || Addr.getOpcode() == ISD::ADD) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) || Addr->isAnyAdd()) {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The existing code doesn't make sense to me, why is this checking isBaseWithConstantOffset if it isn't parsing out the constant offset? This is just redundant with the isAnyAdd?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the code here isn't interested in the `Constant` part of `isBaseWithConstantOffset`; that's checked in `SelectSMRDOffset`. It should be fine (even better, in some hypothetical cases) to replace this call with an `isADDLike` call (which catches disjoint ors and an xor special case, whereas `isAnyAdd` only checks for `ISD::ADD` or `ISD::PTRADD`). I'll try that and update the PR.
Ultimately, it would probably be nice to select offsets for the different addressing mode variants more consistently, but I don't see that in the scope of this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, didn't change the behavior in any lit test.
There are more places in SIISelLowering.cpp and AMDGPUISelDAGToDAG.cpp that check for ISD::ADD in a pointer context, but as far as I can tell those are only relevant for 32-bit pointer arithmetic (like frame indices/scratch addresses and LDS), for which we don't enable PTRADD generation yet. For SWDEV-516125.
Op->isAnyAdd() -> Op.getOpcode() != ISD::SUB
e823a0d
to
bd98fa8
Compare
2fb216f
to
ee350dc
Compare
There are more places in SIISelLowering.cpp and AMDGPUISelDAGToDAG.cpp
that check for ISD::ADD in a pointer context, but as far as I can tell
those are only relevant for 32-bit pointer arithmetic (like frame
indices/scratch addresses and LDS), for which we don't enable PTRADD
generation yet.
For SWDEV-516125.