Skip to content

Commit 3cc5557

Browse files
committed
[AMDGPU] Intrinsic for launching whole wave functions
Add the llvm.amdgcn.call.whole.wave intrinsic for calling whole wave functions. This will take as its first argument the callee with the amdgpu_gfx_whole_wave calling convention, followed by the call parameters which must match the signature of the callee except for the first function argument (the i1 original EXEC mask, which doesn't need to be passed in). Indirect calls are not allowed. Make direct calls to amdgpu_gfx_whole_wave functions a verifier error. Unspeakable horrors happen around calls from whole wave functions; the plan is to improve the handling of caller/callee-saved registers in a future patch. Tail calls are also handled in a future patch.
1 parent 0eb6c66 commit 3cc5557

File tree

11 files changed

+1854
-3
lines changed

11 files changed

+1854
-3
lines changed

llvm/include/llvm/IR/CallingConv.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,13 @@ namespace CallingConv {
297297
/// directly or indirectly via a call-like instruction.
298298
constexpr bool isCallableCC(CallingConv::ID CC) {
299299
switch (CC) {
300+
// Called with special intrinsics:
301+
// llvm.amdgcn.cs.chain
300302
case CallingConv::AMDGPU_CS_Chain:
301303
case CallingConv::AMDGPU_CS_ChainPreserve:
304+
// llvm.amdgcn.call.whole.wave
305+
case CallingConv::AMDGPU_Gfx_WholeWave:
306+
// Hardware entry points:
302307
case CallingConv::AMDGPU_CS:
303308
case CallingConv::AMDGPU_ES:
304309
case CallingConv::AMDGPU_GS:

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2572,6 +2572,18 @@ def int_amdgcn_cs_chain:
25722572
],
25732573
[IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>;
25742574

2575+
// Run a function with all the lanes enabled. Only direct calls are allowed. The
2576+
// first argument is the callee, which must have the `amdgpu_gfx_whole_wave`
2577+
// calling convention and must not be variadic. The remaining arguments to the
2578+
// callee are taken from the arguments passed to the intrinsic. Lanes that are
2579+
// inactive at the point of the call will receive poison. The return value is
2580+
// the return value of the callee for the active lanes and poison for the
2581+
// inactive ones.
2582+
def int_amdgcn_call_whole_wave:
2583+
Intrinsic<[llvm_any_ty], // The return type of the callee.
2584+
[llvm_anyptr_ty, // The callee.
2585+
llvm_vararg_ty], // The arguments to the callee.
2586+
[IntrConvergent, IntrNoReturn, IntrNoCallback, IntrNoFree]>;
25752587

25762588
//===----------------------------------------------------------------------===//
25772589
// CI+ Intrinsics

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2548,6 +2548,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
25482548
getOrCreateVReg(*ConstantInt::getTrue(CI.getType())));
25492549
return true;
25502550
case Intrinsic::amdgcn_cs_chain:
2551+
case Intrinsic::amdgcn_call_whole_wave:
25512552
return translateCallBase(CI, MIRBuilder);
25522553
case Intrinsic::fptrunc_round: {
25532554
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7975,6 +7975,43 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
79757975
HasTailCall = true;
79767976
return;
79777977
}
7978+
case Intrinsic::amdgcn_call_whole_wave: {
7979+
TargetLowering::ArgListTy Args;
7980+
7981+
// The first argument is the callee. Skip it when assembling the call args.
7982+
TargetLowering::ArgListEntry Arg;
7983+
for (unsigned Idx = 1; Idx < I.arg_size(); ++Idx) {
7984+
Arg.Node = getValue(I.getArgOperand(Idx));
7985+
Arg.Ty = I.getArgOperand(Idx)->getType();
7986+
Arg.setAttributes(&I, Idx);
7987+
Args.push_back(Arg);
7988+
}
7989+
7990+
SDValue ConvControlToken;
7991+
if (auto Bundle = I.getOperandBundle(LLVMContext::OB_convergencectrl)) {
7992+
auto *Token = Bundle->Inputs[0].get();
7993+
ConvControlToken = getValue(Token);
7994+
}
7995+
7996+
TargetLowering::CallLoweringInfo CLI(DAG);
7997+
CLI.setDebugLoc(getCurSDLoc())
7998+
.setChain(getRoot())
7999+
.setCallee(CallingConv::AMDGPU_Gfx_WholeWave, I.getType(),
8000+
getValue(I.getArgOperand(0)), std::move(Args))
8001+
.setTailCall(false)
8002+
.setIsPreallocated(
8003+
I.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0)
8004+
.setConvergent(I.isConvergent())
8005+
.setConvergenceControlToken(ConvControlToken);
8006+
CLI.CB = &I;
8007+
8008+
std::pair<SDValue, SDValue> Result =
8009+
lowerInvokable(CLI, /*EHPadBB*/ nullptr);
8010+
8011+
if (Result.first.getNode())
8012+
setValue(&I, Result.first);
8013+
return;
8014+
}
79788015
case Intrinsic::ptrmask: {
79798016
SDValue Ptr = getValue(I.getOperand(0));
79808017
SDValue Mask = getValue(I.getOperand(1));

llvm/lib/IR/Verifier.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6504,6 +6504,36 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
65046504
"Value for inactive lanes must be a VGPR function argument", &Call);
65056505
break;
65066506
}
6507+
case Intrinsic::amdgcn_call_whole_wave: {
6508+
auto F = dyn_cast<Function>(Call.getArgOperand(0));
6509+
Check(F, "Indirect whole wave calls are not allowed", &Call);
6510+
6511+
CallingConv::ID CC = F->getCallingConv();
6512+
Check(CC == CallingConv::AMDGPU_Gfx_WholeWave,
6513+
"Callee must have the amdgpu_gfx_whole_wave calling convention",
6514+
&Call);
6515+
6516+
Check(!F->isVarArg(), "Variadic whole wave calls are not allowed", &Call);
6517+
6518+
Check(Call.arg_size() == F->arg_size(),
6519+
"Call argument count must match callee argument count", &Call);
6520+
6521+
// The first argument of the call is the callee, and the first argument of
6522+
// the callee is the active mask. The rest of the arguments must match.
6523+
Check(F->arg_begin()->getType()->isIntegerTy(1),
6524+
"Callee must have i1 as its first argument", &Call);
6525+
for (auto [CallArg, FuncArg] :
6526+
drop_begin(zip_equal(Call.args(), F->args()))) {
6527+
Check(CallArg->getType() == FuncArg.getType(),
6528+
"Argument types must match", &Call);
6529+
6530+
// Check that inreg attributes match between call site and function
6531+
Check(Call.paramHasAttr(FuncArg.getArgNo(), Attribute::InReg) ==
6532+
FuncArg.hasInRegAttr(),
6533+
"Argument inreg attributes must match", &Call);
6534+
}
6535+
break;
6536+
}
65076537
case Intrinsic::amdgcn_s_prefetch_data: {
65086538
Check(
65096539
AMDGPU::isFlatGlobalAddrSpace(

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,9 +1465,22 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
14651465
CallLoweringInfo &Info) const {
14661466
if (Function *F = Info.CB->getCalledFunction())
14671467
if (F->isIntrinsic()) {
1468-
assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
1469-
"Unexpected intrinsic");
1470-
return lowerChainCall(MIRBuilder, Info);
1468+
switch (F->getIntrinsicID()) {
1469+
case Intrinsic::amdgcn_cs_chain:
1470+
return lowerChainCall(MIRBuilder, Info);
1471+
case Intrinsic::amdgcn_call_whole_wave:
1472+
Info.CallConv = CallingConv::AMDGPU_Gfx_WholeWave;
1473+
1474+
// Get the callee from the original instruction, so it doesn't look like
1475+
// this is an indirect call.
1476+
Info.Callee = MachineOperand::CreateGA(
1477+
static_cast<GlobalValue *>(Info.CB->getOperand(0)), /*Offset=*/0);
1478+
Info.OrigArgs.erase(Info.OrigArgs.begin());
1479+
Info.IsVarArg = false;
1480+
break;
1481+
default:
1482+
llvm_unreachable("Unexpected intrinsic call");
1483+
}
14711484
}
14721485

14731486
if (Info.IsVarArg) {
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL
4+
5+
declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c)
6+
7+
define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) {
8+
; DAGISEL-LABEL: basic_test:
9+
; DAGISEL: ; %bb.0:
10+
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
11+
; DAGISEL-NEXT: s_wait_expcnt 0x0
12+
; DAGISEL-NEXT: s_wait_samplecnt 0x0
13+
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
14+
; DAGISEL-NEXT: s_wait_kmcnt 0x0
15+
; DAGISEL-NEXT: s_mov_b32 s0, s33
16+
; DAGISEL-NEXT: s_mov_b32 s33, s32
17+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
18+
; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
19+
; DAGISEL-NEXT: s_wait_alu 0xfffe
20+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
21+
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2
22+
; DAGISEL-NEXT: s_clause 0x1
23+
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
24+
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33
25+
; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1
26+
; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
27+
; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0
28+
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
29+
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
30+
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
31+
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1
32+
; DAGISEL-NEXT: s_wait_alu 0xfffe
33+
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
34+
; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off
35+
; DAGISEL-NEXT: s_clause 0x1
36+
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
37+
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
38+
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
39+
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
40+
; DAGISEL-NEXT: s_mov_b32 s32, s33
41+
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
42+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
43+
; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
44+
; DAGISEL-NEXT: s_wait_alu 0xfffe
45+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
46+
; DAGISEL-NEXT: s_mov_b32 s33, s0
47+
; DAGISEL-NEXT: s_wait_loadcnt 0x0
48+
; DAGISEL-NEXT: s_wait_alu 0xfffe
49+
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
50+
;
51+
; GISEL-LABEL: basic_test:
52+
; GISEL: ; %bb.0:
53+
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
54+
; GISEL-NEXT: s_wait_expcnt 0x0
55+
; GISEL-NEXT: s_wait_samplecnt 0x0
56+
; GISEL-NEXT: s_wait_bvhcnt 0x0
57+
; GISEL-NEXT: s_wait_kmcnt 0x0
58+
; GISEL-NEXT: s_mov_b32 s0, s33
59+
; GISEL-NEXT: s_mov_b32 s33, s32
60+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
61+
; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
62+
; GISEL-NEXT: s_wait_alu 0xfffe
63+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
64+
; GISEL-NEXT: v_writelane_b32 v42, s0, 2
65+
; GISEL-NEXT: s_clause 0x1
66+
; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4
67+
; GISEL-NEXT: scratch_store_b32 off, v41, s33
68+
; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2
69+
; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0
70+
; GISEL-NEXT: v_writelane_b32 v42, s30, 0
71+
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo
72+
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi
73+
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
74+
; GISEL-NEXT: v_writelane_b32 v42, s31, 1
75+
; GISEL-NEXT: s_wait_alu 0xfffe
76+
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
77+
; GISEL-NEXT: global_store_b32 v[40:41], v0, off
78+
; GISEL-NEXT: s_clause 0x1
79+
; GISEL-NEXT: scratch_load_b32 v41, off, s33
80+
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
81+
; GISEL-NEXT: v_readlane_b32 s31, v42, 1
82+
; GISEL-NEXT: v_readlane_b32 s30, v42, 0
83+
; GISEL-NEXT: s_mov_b32 s32, s33
84+
; GISEL-NEXT: v_readlane_b32 s0, v42, 2
85+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
86+
; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
87+
; GISEL-NEXT: s_wait_alu 0xfffe
88+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
89+
; GISEL-NEXT: s_mov_b32 s33, s0
90+
; GISEL-NEXT: s_wait_loadcnt 0x0
91+
; GISEL-NEXT: s_wait_alu 0xfffe
92+
; GISEL-NEXT: s_setpc_b64 s[30:31]
93+
%y = add i32 %x, 13
94+
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c)
95+
store i32 %ret, ptr addrspace(1) %ptr
96+
ret void
97+
}
98+
99+
declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x)
100+
101+
define amdgpu_gfx void @ret_void(i32 %x) {
102+
; DAGISEL-LABEL: ret_void:
103+
; DAGISEL: ; %bb.0:
104+
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0
105+
; DAGISEL-NEXT: s_wait_expcnt 0x0
106+
; DAGISEL-NEXT: s_wait_samplecnt 0x0
107+
; DAGISEL-NEXT: s_wait_bvhcnt 0x0
108+
; DAGISEL-NEXT: s_wait_kmcnt 0x0
109+
; DAGISEL-NEXT: s_mov_b32 s0, s33
110+
; DAGISEL-NEXT: s_mov_b32 s33, s32
111+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
112+
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
113+
; DAGISEL-NEXT: s_wait_alu 0xfffe
114+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
115+
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2
116+
; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
117+
; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
118+
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16
119+
; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0
120+
; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1
121+
; DAGISEL-NEXT: s_wait_alu 0xfffe
122+
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
123+
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
124+
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
125+
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
126+
; DAGISEL-NEXT: s_mov_b32 s32, s33
127+
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
128+
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
129+
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
130+
; DAGISEL-NEXT: s_wait_alu 0xfffe
131+
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1
132+
; DAGISEL-NEXT: s_mov_b32 s33, s0
133+
; DAGISEL-NEXT: s_wait_loadcnt 0x0
134+
; DAGISEL-NEXT: s_wait_alu 0xfffe
135+
; DAGISEL-NEXT: s_setpc_b64 s[30:31]
136+
;
137+
; GISEL-LABEL: ret_void:
138+
; GISEL: ; %bb.0:
139+
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
140+
; GISEL-NEXT: s_wait_expcnt 0x0
141+
; GISEL-NEXT: s_wait_samplecnt 0x0
142+
; GISEL-NEXT: s_wait_bvhcnt 0x0
143+
; GISEL-NEXT: s_wait_kmcnt 0x0
144+
; GISEL-NEXT: s_mov_b32 s0, s33
145+
; GISEL-NEXT: s_mov_b32 s33, s32
146+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
147+
; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
148+
; GISEL-NEXT: s_wait_alu 0xfffe
149+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
150+
; GISEL-NEXT: v_writelane_b32 v40, s0, 2
151+
; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo
152+
; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi
153+
; GISEL-NEXT: s_add_co_i32 s32, s32, 16
154+
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
155+
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
156+
; GISEL-NEXT: s_wait_alu 0xfffe
157+
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
158+
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
159+
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
160+
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
161+
; GISEL-NEXT: s_mov_b32 s32, s33
162+
; GISEL-NEXT: v_readlane_b32 s0, v40, 2
163+
; GISEL-NEXT: s_or_saveexec_b32 s1, -1
164+
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
165+
; GISEL-NEXT: s_wait_alu 0xfffe
166+
; GISEL-NEXT: s_mov_b32 exec_lo, s1
167+
; GISEL-NEXT: s_mov_b32 s33, s0
168+
; GISEL-NEXT: s_wait_loadcnt 0x0
169+
; GISEL-NEXT: s_wait_alu 0xfffe
170+
; GISEL-NEXT: s_setpc_b64 s[30:31]
171+
call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x)
172+
ret void
173+
}
174+

llvm/test/CodeGen/AMDGPU/irtranslator-whole-wave-functions.ll

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,29 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
101101
%ret = call i64 @llvm.amdgcn.update.dpp.i64(i64 %x, i64 %y, i32 1, i32 1, i32 1, i1 false)
102102
ret i64 %ret
103103
}
104+
105+
declare amdgpu_gfx_whole_wave i32 @callee(i1 %active, i32 %x)
106+
107+
; Make sure we don't pass the first argument (i1).
108+
define amdgpu_cs void @call(i32 %x, ptr %p) {
109+
; CHECK-LABEL: name: call
110+
; CHECK: bb.1 (%ir-block.0):
111+
; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
112+
; CHECK-NEXT: {{ $}}
113+
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
114+
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
115+
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
116+
; CHECK-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
117+
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
118+
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
119+
; CHECK-NEXT: [[GV1:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @callee
120+
; CHECK-NEXT: $vgpr0 = COPY [[COPY]](s32)
121+
; CHECK-NEXT: $sgpr30_sgpr31 = G_SI_CALL [[GV1]](p0), @callee, csr_amdgpu_si_gfx, implicit $vgpr0, implicit-def $vgpr0
122+
; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr0
123+
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
124+
; CHECK-NEXT: G_STORE [[COPY3]](s32), [[MV]](p0) :: (store (s32) into %ir.p)
125+
; CHECK-NEXT: S_ENDPGM 0
126+
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @callee, i32 %x) convergent
127+
store i32 %ret, ptr %p
128+
ret void
129+
}

0 commit comments

Comments
 (0)