[AMDGPU] Tail call support for whole wave functions #145860
base: users/rovka/whole-wave-funcs-call
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Diana Picus (rovka)

Changes

Support tail calls to whole wave functions (trivial) and from whole wave functions (slightly more involved because we need a new pseudo for the tail call return, that patches up the EXEC mask).

Move the expansion of whole wave function return pseudos (regular and tail call returns) to prolog epilog insertion, since that's where we patch up the EXEC mask.

Unnecessary register spills will be dealt with in a future patch.

Patch is 154.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/145860.diff

15 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2310d511b1df8..0e27b0f764795 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7977,6 +7977,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
case Intrinsic::amdgcn_call_whole_wave: {
TargetLowering::ArgListTy Args;
+ bool isTailCall = I.isTailCall();
// The first argument is the callee. Skip it when assembling the call args.
TargetLowering::ArgListEntry Arg;
@@ -7984,6 +7985,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
Arg.Node = getValue(I.getArgOperand(Idx));
Arg.Ty = I.getArgOperand(Idx)->getType();
Arg.setAttributes(&I, Idx);
+
+ if (Arg.IsSRet && isa<Instruction>(I.getArgOperand(Idx)))
+ isTailCall = false;
+
Args.push_back(Arg);
}
@@ -7998,7 +8003,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
.setChain(getRoot())
.setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
getValue(I.getArgOperand(0)), std::move(Args))
- .setTailCall(false)
+ .setTailCall(isTailCall && canTailCall(I))
.setIsPreallocated(
I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
.setConvergent(I.isConvergent())
@@ -8879,6 +8884,29 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
return Result;
}
+bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const {
+ bool isMustTailCall = CB.isMustTailCall();
+
+ // Avoid emitting tail calls in functions with the disable-tail-calls
+ // attribute.
+ auto *Caller = CB.getParent()->getParent();
+ if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
+ "true" &&
+ !isMustTailCall)
+ return false;
+
+ // We can't tail call inside a function with a swifterror argument. Lowering
+ // does not support this yet. It would have to move into the swifterror
+ // register before the call.
+ if (DAG.getTargetLoweringInfo().supportSwiftError() &&
+ Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return false;
+
+ // Check if target-independent constraints permit a tail call here.
+ // Target-dependent constraints are checked within TLI->LowerCallTo.
+ return isInTailCallPosition(CB, DAG.getTarget());
+}
+
void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
bool isTailCall, bool isMustTailCall,
const BasicBlock *EHPadBB,
@@ -8893,21 +8921,8 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
const Value *SwiftErrorVal = nullptr;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (isTailCall) {
- // Avoid emitting tail calls in functions with the disable-tail-calls
- // attribute.
- auto *Caller = CB.getParent()->getParent();
- if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
- "true" && !isMustTailCall)
- isTailCall = false;
-
- // We can't tail call inside a function with a swifterror argument. Lowering
- // does not support this yet. It would have to move into the swifterror
- // register before the call.
- if (TLI.supportSwiftError() &&
- Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
- isTailCall = false;
- }
+ if (isTailCall)
+ isTailCall = canTailCall(CB);
for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
TargetLowering::ArgListEntry Entry;
@@ -8952,11 +8967,6 @@ void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
Args.push_back(Entry);
}
- // Check if target-independent constraints permit a tail call here.
- // Target-dependent constraints are checked within TLI->LowerCallTo.
- if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget()))
- isTailCall = false;
-
// Disable tail calls if there is an swifterror argument. Targets have not
// been updated to support tail calls.
if (TLI.supportSwiftError() && SwiftErrorVal)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 1c278076a219d..58d9f04c61156 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -408,6 +408,10 @@ class SelectionDAGBuilder {
bool IsMustTailCall, const BasicBlock *EHPadBB = nullptr,
const TargetLowering::PtrAuthInfo *PAI = nullptr);
+ // Check some of the target-independent constraints for tail calls. This does
+ // not iterate over the call arguments.
+ bool canTailCall(const CallBase &CB) const;
+
// Lower range metadata from 0 to N to assert zext to an integer of nearest
// floor power of two.
SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index a704a76502b6d..f564b7cd4dd20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -993,8 +993,14 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
return IsWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
}
- return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
- AMDGPU::SI_TCRETURN;
+ if (CallerF.getFunction().getCallingConv() ==
+ CallingConv::AMDGPU_Gfx_WholeWave)
+ return AMDGPU::SI_TCRETURN_GFX_WholeWave;
+
+ if (CC == CallingConv::AMDGPU_Gfx || CC == CallingConv::AMDGPU_Gfx_WholeWave)
+ return AMDGPU::SI_TCRETURN_GFX;
+
+ return AMDGPU::SI_TCRETURN;
}
// Add operands to call instruction to track the callee.
@@ -1273,6 +1279,13 @@ bool AMDGPUCallLowering::lowerTailCall(
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), /*IsTailCall*/ true,
ST.isWave32(), CalleeCC, IsDynamicVGPRChainCall);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
+
+ if (FuncInfo->isWholeWaveFunction())
+ addOriginalExecToReturn(MF, MIB);
+
+ // Keep track of the index of the next operand to be added to the call
+ unsigned CalleeIdx = MIB->getNumOperands();
+
if (!addCallTargetOperands(MIB, MIRBuilder, Info, IsDynamicVGPRChainCall))
return false;
@@ -1390,7 +1403,7 @@ bool AMDGPUCallLowering::lowerTailCall(
// If we have -tailcallopt, we need to adjust the stack. We'll do the call
// sequence start and end here.
if (!IsSibCall) {
- MIB->getOperand(1).setImm(FPDiff);
+ MIB->getOperand(CalleeIdx + 1).setImm(FPDiff);
CallSeqStart.addImm(NumBytes).addImm(0);
// End the call sequence *before* emitting the call. Normally, we would
// tidy the frame up after the call. However, here, we've laid out the
@@ -1402,16 +1415,24 @@ bool AMDGPUCallLowering::lowerTailCall(
// Now we can add the actual call instruction to the correct basic block.
MIRBuilder.insertInstr(MIB);
+ // If this is a whole wave tail call, we need to constrain the register for
+ // the original EXEC.
+ if (MIB->getOpcode() == AMDGPU::SI_TCRETURN_GFX_WholeWave) {
+ MIB->getOperand(0).setReg(
+ constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
+ *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
+ }
+
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
// constraint of that instruction.
// FIXME: We should define regbankselectable call instructions to handle
// divergent call targets.
- if (MIB->getOperand(0).isReg()) {
- MIB->getOperand(0).setReg(
- constrainOperandRegClass(MF, *TRI, MRI, *TII, *ST.getRegBankInfo(),
- *MIB, MIB->getDesc(), MIB->getOperand(0), 0));
+ if (MIB->getOperand(CalleeIdx).isReg()) {
+ MIB->getOperand(CalleeIdx).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *TII, *ST.getRegBankInfo(), *MIB, MIB->getDesc(),
+ MIB->getOperand(CalleeIdx), CalleeIdx));
}
MF.getFrameInfo().setHasTailCall();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f7dba4d3fb892..dfbcba637ef9a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5578,6 +5578,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TC_RETURN_GFX)
+ NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
NODE_NAME_CASE(TC_RETURN_CHAIN)
NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
NODE_NAME_CASE(TRAP)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 5716711de3402..120fa819e8a55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -402,6 +402,7 @@ enum NodeType : unsigned {
CALL,
TC_RETURN,
TC_RETURN_GFX,
+ TC_RETURN_GFX_WholeWave,
TC_RETURN_CHAIN,
TC_RETURN_CHAIN_DVGPR,
TRAP,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index e305f08925cc6..b8fa6f3fc6867 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,10 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
+def AMDGPUtc_return_gfx_ww: SDNode<"AMDGPUISD::TC_RETURN_GFX_WholeWave", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index b88df50c6c999..3caeda651f96b 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1125,9 +1125,18 @@ void SIFrameLowering::emitCSRSpillRestores(
RestoreWWMRegisters(WWMCalleeSavedRegs);
// The original EXEC is the first operand of the return instruction.
- const MachineInstr &Return = MBB.instr_back();
- assert(Return.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN &&
- "Unexpected return inst");
+ MachineInstr &Return = MBB.instr_back();
+ unsigned Opcode = Return.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
+ Opcode = AMDGPU::SI_RETURN;
+ break;
+ case AMDGPU::SI_TCRETURN_GFX_WholeWave:
+ Opcode = AMDGPU::SI_TCRETURN_GFX;
+ break;
+ default:
+ llvm_unreachable("Unexpected return inst");
+ }
Register OrigExec = Return.getOperand(0).getReg();
if (!WWMScratchRegs.empty()) {
@@ -1141,6 +1150,11 @@ void SIFrameLowering::emitCSRSpillRestores(
// Restore original EXEC.
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addReg(OrigExec);
+
+ // Drop the first operand and update the opcode.
+ Return.removeOperand(0);
+ Return.setDesc(TII->get(Opcode));
+
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3cbaa7d8cb4ad..40ea309dd8a52 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4130,6 +4130,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
break;
}
+ // If the caller is a whole wave function, we need to use a special opcode
+ // so we can patch up EXEC.
+ if (Info->isWholeWaveFunction())
+ OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
+
return DAG.getNode(OPC, DL, MVT::Other, Ops);
}
@@ -5871,6 +5876,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.eraseFromParent();
return SplitBB;
}
+ case AMDGPU::SI_TCRETURN_GFX_WholeWave:
case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
assert(MFI->isWholeWaveFunction());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 436bed468c444..f3b413f060708 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2515,7 +2515,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
case AMDGPU::SI_RETURN: {
const MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 225a073db33d1..ca77573311ffa 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -670,6 +670,33 @@ def SI_WHOLE_WAVE_FUNC_RETURN : SPseudoInstSI <
def : GCNPat<
(AMDGPUwhole_wave_return), (SI_WHOLE_WAVE_FUNC_RETURN (i1 (IMPLICIT_DEF)))>;
+// Restores the previous EXEC and otherwise behaves entirely like a SI_TCRETURN.
+// This is used for tail calls *from* a whole wave function. Tail calls to
+// a whole wave function may use the usual opcodes, depending on the calling
+// convention of the caller.
+def SI_TCRETURN_GFX_WholeWave : SPseudoInstSI <
+ (outs),
+ (ins SReg_1:$orig_exec, Gfx_CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+ let isCall = 1;
+ let isTerminator = 1;
+ let isReturn = 1;
+ let isBarrier = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+ let isConvergent = 1;
+
+ // We're going to use custom handling to set the $orig_exec to the correct value.
+ let usesCustomInserter = 1;
+}
+
+// Generate a SI_TCRETURN_GFX_WholeWave pseudo with a placeholder for its
+// argument. It will be filled in by the custom inserter.
+def : GCNPat<
+ (AMDGPUtc_return_gfx_ww i64:$src0, tglobaladdr:$callee, i32:$fpdiff),
+ (SI_TCRETURN_GFX_WholeWave (i1 (IMPLICIT_DEF)), Gfx_CCR_SGPR_64:$src0,
+ tglobaladdr:$callee, i32:$fpdiff)>;
+
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2f15d8d58c5cf..13a20c4f0d405 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1412,6 +1412,7 @@ constexpr bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_Gfx_WholeWave:
return true;
default:
return canGuaranteeTCO(CC);
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index eac0767c88d80..356bf4b3cac28 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -96,6 +96,672 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
ret void
}
+define amdgpu_gfx i32 @tail_call_from_gfx(i32 %x, i32 inreg %c) {
+; DAGISEL-LABEL: tail_call_from_gfx:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
+; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
+; DAGISEL-NEXT: s_wait_alu 0xfffe
+; DAGISEL-NEXT: s_setpc_b64 s[0:1]
+;
+; GISEL-LABEL: tail_call_from_gfx:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_expcnt 0x0
+; GISEL-NEXT: s_wait_samplecnt 0x0
+; GISEL-NEXT: s_wait_bvhcnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
+; GISEL-NEXT: s_mov_b32 s36, good_callee@abs32@lo
+; GISEL-NEXT: s_mov_b32 s37, good_callee@abs32@hi
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_setpc_b64 s[36:37]
+ %y = add i32 %x, 13
+ %ret = tail call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
+ ret i32 %ret
+}
+
+define amdgpu_gfx_whole_wave i32 @tail_call_from_whole_wave(i1 %active, i32 %x, i32 inreg %c) {
+; DAGISEL-LABEL: tail_call_from_whole_wave:
+; DAGISEL: ; %bb.0:
+; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; DAGISEL-NEXT: s_wait_expcnt 0x0
+; DAGISEL-NEXT: s_wait_samplecnt 0x0
+; DAGISEL-NEXT: s_wait_bvhcnt 0x0
+; DAGISEL-NEXT: s_wait_kmcnt 0x0
+; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v0, s32
+; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4
+; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8
+; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12
+; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16
+; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20
+; DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:24
+; DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:28
+; DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:32
+; DAGISEL-NEXT: scratch_store_b32 off, v9, s32 offset:36
+; DAGISEL-NEXT: scratch_store_b32 off, v10, s32 offset:40
+; DAGISEL-NEXT: scratch_store_b32 off, v11, s32 offset:44
+; DAGISEL-NEXT: scratch_store_b32 off, v12, s32 offset:48
+; DAGISEL-NEXT: scratch_store_b32 off, v13, s32 offset:52
+; DAGISEL-NEXT: scratch_store_b32 off, v14, s32 offset:56
+; DAGISEL-NEXT: scratch_store_b32 off, v15, s32 offset:60
+; DAGISEL-NEXT: scratch_store_b32 off, v16, s32 offset:64
+; DAGISEL-NEXT: scratch_store_b32 off, v17, s32 offset:68
+; DAGISEL-NEXT: scratch_store_b32 off, v18, s32 offset:72
+; DAGISEL-NEXT: scratch_store_b32 off, v19, s32 offset:76
+; DAGISEL-NEXT: scratch_store_b32 off, v20, s32 offset:80
+; DAGISEL-NEXT: scratch_store_b32 off, v21, s32 offset:84
+; DAGISEL-NEXT: scratch_store_b32 off, v22, s32 offset:88
+; DAGISEL-NEXT: scratch_store_b32 off, v23, s32 offset:92
+; DAGISEL-NEXT: scratch_store_b32 off, v24, s32 offset:96
+; DAGISEL-NEXT: scratch_store_b32 off, v25, s32 offset:100
+; DAGISEL-NEXT: scratch_store_b32 off, v26, s32 offset:104
+; DAGISEL-NEXT: scratch_store_b32 off, v27, s32 offset:108
+; DAGISEL-NEXT: scratch_store_b32 off, v28, s32 offset:112
+; DAGISEL-NEXT: scratch_store_b32 off, v29, s32 offset:116
+; DAGISEL-NEXT: scratch_store_b32 off, v30, s32 offset:120
+; DAGISEL-NEXT: scratch_store_b32 off, v31, s32 offset:124
+; DAGISEL-NEXT: s_clause 0x1f
+; DAGISEL-NEXT: scratch_store_b32 off, v32, s32 offset:128
+; DAGISEL-NEXT: scratch_store_b32 off, v33, s32 offset:132
+; DAGISEL-NEXT: scratch_store_b32 off, v34, s32 offset:136
+; DAGISEL-NEXT: scratch_store_b32 off, v35, s32 offset:140
+; DAGISEL-NEXT: scratch_store_b32 off, v36, s32 offset:144
+; DAGISEL-NEXT: scratch_store_b32 off, v37, s32 offset:148
+; DAGISEL-NEXT: scratch_store_b32 off, v38, s32 offset:152
+; DAGISEL-NEXT: scratch_store_b32 off, v39, s32 offset:156
+; DAGISEL-NEXT: scratch_store_b32 off, v48, s32 offset:160
+; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:164
+; DAGISEL-NEXT: scratch_store_b32 off, v50, s32 offset:168
+; DAGISEL-NEXT: scratch_store_b32 off, v51, s32 offset:172
+; DAGISEL-NEXT: scratch_store_b32 off, v52, s32 offset:176
+; DAGISEL-NEXT: scratch_store_b32 off, v53, s32 offset:180
+; DAGISEL-NEXT: scratch_store_b32 off, v54, s32 offset:184
+; DAGISEL-NEXT: scratch_store_b32 off, v55, s32 offset:188
+; DAGISEL-NEXT: scratch_store_b32 off, v64, s32 offset:192
+; DAGISEL-NEXT: scratch_store_b32 off, v65, s32 offset:196
+; DAGISEL-NEXT: scratch_store_b32 off, v66, s32 offset:200
+; DAGISEL-NEXT: scratch_store_b32 off, v67, s32 offset:204
+; DAGISEL-NEXT: scratch_store_b32 o...
[truncated]
> if (Arg.IsSRet && isa<Instruction>(I.getArgOperand(Idx)))
>   isTailCall = false;
Can you include this in that isEligibleForTailCall function instead? And comment it?
Force-pushed from 3cc5557 to a67a2d4
Force-pushed from 4594737 to 7b68ddf
Support tail calls to whole wave functions (trivial) and from whole wave
functions (slightly more involved because we need a new pseudo for the
tail call return, that patches up the EXEC mask).
Move the expansion of whole wave function return pseudos (regular and
tail call returns) to prolog epilog insertion, since that's where we
patch up the EXEC mask.
Unnecessary register spills will be dealt with in a future patch.
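
For reference, a minimal IR sketch of the two directions this patch enables, adapted from the new tests in llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll. Only the calling conventions and the tail-call form of the intrinsic call are taken verbatim from the patch; the body of @good_callee and of the whole-wave caller are assumptions (the test diff above is truncated), and intrinsic declarations are elided as in the test file.

; Hypothetical whole wave callee. The real @good_callee lives in the (truncated)
; test file; this body is only a stand-in so the example reads end to end.
define amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c) {
  %sum = add i32 %x, %y
  %ret = add i32 %sum, %c
  ret i32 %ret
}

; Tail call *to* a whole wave function from an ordinary gfx function: the trivial
; case, lowered to a plain tail-call branch (compare the tail_call_from_gfx test).
define amdgpu_gfx i32 @tail_call_to_whole_wave_example(i32 %x, i32 inreg %c) {
  %y = add i32 %x, 13
  %ret = tail call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
  ret i32 %ret
}

; Tail call *from* a whole wave function: the case that needs the new
; SI_TCRETURN_GFX_WholeWave pseudo, so the epilog can restore the original EXEC
; (saved to an SGPR by s_xor_saveexec in the prolog) before branching to the callee.
define amdgpu_gfx_whole_wave i32 @tail_call_from_whole_wave_example(i1 %active, i32 %x, i32 inreg %c) {
  %y = add i32 %x, 13
  %ret = tail call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
  ret i32 %ret
}

In the second function, emitCSRSpillRestores restores EXEC from the saved SGPR and then rewrites SI_TCRETURN_GFX_WholeWave into a plain SI_TCRETURN_GFX, so the callee starts with the same EXEC it would see when tail-called from a regular gfx function.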