Commit 68fea00
[SPIRV] Use AMDGPU ABI for AMDGCN flavoured SPIRV (#169865)
At the moment AMDGCN flavoured SPIRV uses the SPIRV ABI with some tweaks revolving around passing aggregates as direct. This is problematic in multiple ways:

- it leads to divergence from code compiled for a concrete target, which makes it difficult to debug;
- it incurs a run time cost, when dealing with larger aggregates;
- it incurs a compile time cost, when dealing with larger aggregates.

This patch switches over AMDGCN flavoured SPIRV to implement the AMDGPU ABI (except for dealing with variadic functions, which will be added in the future). One additional complication (and the primary motivation behind the current less than ideal state of affairs) stems from `byref`, which AMDGPU uses, not being expressible in SPIR-V. We deal with this by CodeGen-ing for `byref`, lowering it to the `FuncParamAttr ByVal` in SPIR-V, and restoring it when doing reverse translation from AMDGCN flavoured SPIR-V.
1 parent 11fd760 commit 68fea00
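As a rough illustration of the argument classification this commit switches to, here is a standalone sketch (not part of the commit; `classifyAggregateSketch` and its string results are invented stand-ins for `ABIArgInfo`, and it models only the type size in bits against the 16-register budget `MaxNumRegsForArgsRet` from the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical sketch mirroring the decision ladder in
// AMDGCNSPIRVABIInfo::classifyArgumentType for aggregate arguments:
// small aggregates get packed into integer registers, larger ones stay
// direct while the register budget lasts, and the rest go indirect
// (the byref-style case the commit message describes).
static std::string classifyAggregateSketch(uint64_t SizeInBits,
                                           unsigned &RegsLeft) {
  if (SizeInBits <= 64) {
    // Pack into a single VGPR or a pair, debiting the register budget.
    RegsLeft -= std::min<uint64_t>(RegsLeft, (SizeInBits + 31) / 32);
    if (SizeInBits <= 16)
      return "direct i16";
    if (SizeInBits <= 32)
      return "direct i32";
    return "direct [2 x i32]"; // retained AMDGPU oddity for <= 64 bits
  }
  // Simplified stand-in for numRegsForType: count 32-bit units.
  unsigned NumRegs = (SizeInBits + 31) / 32;
  if (RegsLeft >= NumRegs) {
    RegsLeft -= NumRegs;
    return "direct";
  }
  return "indirect (private address space)"; // pass-by-reference fallback
}

int main() {
  unsigned RegsLeft = 16; // MaxNumRegsForArgsRet in the patch
  std::cout << classifyAggregateSketch(64, RegsLeft) << '\n';  // [2 x i32]
  std::cout << classifyAggregateSketch(256, RegsLeft) << '\n'; // direct
  std::cout << classifyAggregateSketch(512, RegsLeft) << '\n'; // indirect
}
```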

File tree: 6 files changed (+676, −125 lines)

clang/lib/CodeGen/Targets/SPIR.cpp

Lines changed: 247 additions & 47 deletions
```diff
@@ -9,6 +9,11 @@
 #include "ABIInfoImpl.h"
 #include "HLSLBufferLayoutBuilder.h"
 #include "TargetInfo.h"
+#include "clang/Basic/LangOptions.h"
+#include "llvm/IR/DerivedTypes.h"
+
+#include <stdint.h>
+#include <utility>
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -33,9 +38,43 @@ class SPIRVABIInfo : public CommonSPIRABIInfo {
   void computeInfo(CGFunctionInfo &FI) const override;
 
 private:
+  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
+};
+
+class AMDGCNSPIRVABIInfo : public SPIRVABIInfo {
+  // TODO: this should be unified / shared with AMDGPU, ideally we'd like to
+  // re-use AMDGPUABIInfo eventually, rather than duplicate.
+  static constexpr unsigned MaxNumRegsForArgsRet = 16; // 16 32-bit registers
+  mutable unsigned NumRegsLeft = 0;
+
+  unsigned numRegsForType(QualType Ty) const;
+
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    return true;
+  }
+  bool isHomogeneousAggregateSmallEnough(const Type *Base,
+                                         uint64_t Members) const override {
+    uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
+
+    // Homogeneous Aggregates may occupy at most 16 registers.
+    return Members * NumRegs <= MaxNumRegsForArgsRet;
+  }
+
+  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
+  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
+                                       unsigned ToAS) const;
+
   ABIArgInfo classifyReturnType(QualType RetTy) const;
   ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
   ABIArgInfo classifyArgumentType(QualType Ty) const;
+
+public:
+  AMDGCNSPIRVABIInfo(CodeGenTypes &CGT) : SPIRVABIInfo(CGT) {}
+  void computeInfo(CGFunctionInfo &FI) const override;
+
+  llvm::FixedVectorType *
+  getOptimalVectorMemoryType(llvm::FixedVectorType *Ty,
+                             const LangOptions &LangOpt) const override;
 };
 } // end anonymous namespace
 namespace {
@@ -81,7 +120,10 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo {
 class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo {
 public:
   SPIRVTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
-      : CommonSPIRTargetCodeGenInfo(std::make_unique<SPIRVABIInfo>(CGT)) {}
+      : CommonSPIRTargetCodeGenInfo(
+            (CGT.getTarget().getTriple().getVendor() == llvm::Triple::AMD)
+                ? std::make_unique<AMDGCNSPIRVABIInfo>(CGT)
+                : std::make_unique<SPIRVABIInfo>(CGT)) {}
   void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
   LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
                                   const VarDecl *D) const override;
@@ -130,25 +172,6 @@ void CommonSPIRABIInfo::setCCs() {
   RuntimeCC = llvm::CallingConv::SPIR_FUNC;
 }
 
-ABIArgInfo SPIRVABIInfo::classifyReturnType(QualType RetTy) const {
-  if (getTarget().getTriple().getVendor() != llvm::Triple::AMD)
-    return DefaultABIInfo::classifyReturnType(RetTy);
-  if (!isAggregateTypeForABI(RetTy) || getRecordArgABI(RetTy, getCXXABI()))
-    return DefaultABIInfo::classifyReturnType(RetTy);
-
-  if (const auto *RD = RetTy->getAsRecordDecl();
-      RD && RD->hasFlexibleArrayMember())
-    return DefaultABIInfo::classifyReturnType(RetTy);
-
-  // TODO: The AMDGPU ABI is non-trivial to represent in SPIR-V; in order to
-  // avoid encoding various architecture specific bits here we return everything
-  // as direct to retain type info for things like aggregates, for later perusal
-  // when translating back to LLVM/lowering in the BE. This is also why we
-  // disable flattening as the outcomes can mismatch between SPIR-V and AMDGPU.
-  // This will be revisited / optimised in the future.
-  return ABIArgInfo::getDirect(CGT.ConvertType(RetTy), 0u, nullptr, false);
-}
-
 ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const {
   if (getContext().getLangOpts().isTargetDevice()) {
     // Coerce pointer arguments with default address space to CrossWorkGroup
@@ -165,18 +188,6 @@ ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const {
   }
 
   if (isAggregateTypeForABI(Ty)) {
-    if (getTarget().getTriple().getVendor() == llvm::Triple::AMD)
-      // TODO: The AMDGPU kernel ABI passes aggregates byref, which is not
-      // currently expressible in SPIR-V; SPIR-V passes aggregates byval,
-      // which the AMDGPU kernel ABI does not allow. Passing aggregates as
-      // direct works around this impedance mismatch, as it retains type info
-      // and can be correctly handled, post reverse-translation, by the AMDGPU
-      // BE, which has to support this CC for legacy OpenCL purposes. It can
-      // be brittle and does lead to performance degradation in certain
-      // pathological cases. This will be revisited / optimised in the future,
-      // once a way to deal with the byref/byval impedance mismatch is
-      // identified.
-      return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
     // Force copying aggregate type in kernel arguments by value when
     // compiling CUDA targeting SPIR-V. This is required for the object
     // copied to be valid on the device.
@@ -191,49 +202,238 @@ ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const {
   return classifyArgumentType(Ty);
 }
 
-ABIArgInfo SPIRVABIInfo::classifyArgumentType(QualType Ty) const {
-  if (getTarget().getTriple().getVendor() != llvm::Triple::AMD)
-    return DefaultABIInfo::classifyArgumentType(Ty);
-  if (!isAggregateTypeForABI(Ty))
-    return DefaultABIInfo::classifyArgumentType(Ty);
+void SPIRVABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  // The logic is same as in DefaultABIInfo with an exception on the kernel
+  // arguments handling.
+  llvm::CallingConv::ID CC = FI.getCallingConvention();
+
+  if (!getCXXABI().classifyReturnType(FI))
+    FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
+
+  for (auto &I : FI.arguments()) {
+    if (CC == llvm::CallingConv::SPIR_KERNEL) {
+      I.info = classifyKernelArgumentType(I.type);
+    } else {
+      I.info = classifyArgumentType(I.type);
+    }
+  }
+}
+
+unsigned AMDGCNSPIRVABIInfo::numRegsForType(QualType Ty) const {
+  // This duplicates the AMDGPUABI computation.
+  unsigned NumRegs = 0;
+
+  if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // Compute from the number of elements. The reported size is based on the
+    // in-memory size, which includes the padding 4th element for 3-vectors.
+    QualType EltTy = VT->getElementType();
+    unsigned EltSize = getContext().getTypeSize(EltTy);
+
+    // 16-bit element vectors should be passed as packed.
+    if (EltSize == 16)
+      return (VT->getNumElements() + 1) / 2;
+
+    unsigned EltNumRegs = (EltSize + 31) / 32;
+    return EltNumRegs * VT->getNumElements();
+  }
+
+  if (const auto *RD = Ty->getAsRecordDecl()) {
+    assert(!RD->hasFlexibleArrayMember());
+
+    for (const FieldDecl *Field : RD->fields()) {
+      QualType FieldTy = Field->getType();
+      NumRegs += numRegsForType(FieldTy);
+    }
+
+    return NumRegs;
+  }
+
+  return (getContext().getTypeSize(Ty) + 31) / 32;
+}
+
+llvm::Type *AMDGCNSPIRVABIInfo::coerceKernelArgumentType(llvm::Type *Ty,
+                                                         unsigned FromAS,
+                                                         unsigned ToAS) const {
+  // Single value types.
+  auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
+  if (PtrTy && PtrTy->getAddressSpace() == FromAS)
+    return llvm::PointerType::get(Ty->getContext(), ToAS);
+  return Ty;
+}
+
+ABIArgInfo AMDGCNSPIRVABIInfo::classifyReturnType(QualType RetTy) const {
+  if (!isAggregateTypeForABI(RetTy) || getRecordArgABI(RetTy, getCXXABI()))
+    return DefaultABIInfo::classifyReturnType(RetTy);
+
+  // Ignore empty structs/unions.
+  if (isEmptyRecord(getContext(), RetTy, true))
+    return ABIArgInfo::getIgnore();
+
+  // Lower single-element structs to just return a regular value.
+  if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
+    return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
+
+  if (const auto *RD = RetTy->getAsRecordDecl();
+      RD && RD->hasFlexibleArrayMember())
+    return DefaultABIInfo::classifyReturnType(RetTy);
+
+  // Pack aggregates <= 4 bytes into single VGPR or pair.
+  uint64_t Size = getContext().getTypeSize(RetTy);
+  if (Size <= 16)
+    return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
+
+  if (Size <= 32)
+    return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
+
+  // TODO: This carried over from AMDGPU oddity, we retain it to
+  // ensure consistency, but it might be reasonable to return Int64.
+  if (Size <= 64) {
+    llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
+    return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
+  }
+
+  if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
+    return ABIArgInfo::getDirect();
+  return DefaultABIInfo::classifyReturnType(RetTy);
+}
+
+/// For kernels all parameters are really passed in a special buffer. It doesn't
+/// make sense to pass anything byval, so everything must be direct.
+ABIArgInfo AMDGCNSPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const {
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
+  // TODO: Can we omit empty structs?
+
+  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
+    Ty = QualType(SeltTy, 0);
+
+  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
+  llvm::Type *LTy = OrigLTy;
+  if (getContext().getLangOpts().isTargetDevice()) {
+    LTy = coerceKernelArgumentType(
+        OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
+        /*ToAS=*/getContext().getTargetAddressSpace(LangAS::opencl_global));
+  }
+
+  // FIXME: This doesn't apply the optimization of coercing pointers in structs
+  // to global address space when using byref. This would require implementing a
+  // new kind of coercion of the in-memory type when for indirect arguments.
+  if (LTy == OrigLTy && isAggregateTypeForABI(Ty)) {
+    return ABIArgInfo::getIndirectAliased(
+        getContext().getTypeAlignInChars(Ty),
+        getContext().getTargetAddressSpace(LangAS::opencl_constant),
+        false /*Realign*/, nullptr /*Padding*/);
+  }
+
+  // TODO: inhibiting flattening is an AMDGPU workaround for Clover, which might
+  // be vestigial and should be revisited.
+  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
+}
+
+ABIArgInfo AMDGCNSPIRVABIInfo::classifyArgumentType(QualType Ty) const {
+  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
+
+  Ty = useFirstFieldIfTransparentUnion(Ty);
+
+  // TODO: support for variadics.
+
+  if (!isAggregateTypeForABI(Ty)) {
+    ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
+    if (!ArgInfo.isIndirect()) {
+      unsigned NumRegs = numRegsForType(Ty);
+      NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
+    }
+
+    return ArgInfo;
+  }
 
   // Records with non-trivial destructors/copy-constructors should not be
   // passed by value.
   if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
     return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
                                    RAA == CGCXXABI::RAA_DirectInMemory);
 
+  // Ignore empty structs/unions.
+  if (isEmptyRecord(getContext(), Ty, true))
+    return ABIArgInfo::getIgnore();
+
+  // Lower single-element structs to just pass a regular value. TODO: We
+  // could do reasonable-size multiple-element structs too, using getExpand(),
+  // though watch out for things like bitfields.
+  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
+    return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
+
   if (const auto *RD = Ty->getAsRecordDecl();
       RD && RD->hasFlexibleArrayMember())
     return DefaultABIInfo::classifyArgumentType(Ty);
 
-  return ABIArgInfo::getDirect(CGT.ConvertType(Ty), 0u, nullptr, false);
+  uint64_t Size = getContext().getTypeSize(Ty);
+  if (Size <= 64) {
+    // Pack aggregates <= 8 bytes into single VGPR or pair.
+    unsigned NumRegs = (Size + 31) / 32;
+    NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
+
+    if (Size <= 16)
+      return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
+
+    if (Size <= 32)
+      return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
+
+    // TODO: This is an AMDGPU oddity, and might be vestigial, we retain it to
+    // ensure consistency, but it should be revisited.
+    llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
+    return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
+  }
+
+  if (NumRegsLeft > 0) {
+    unsigned NumRegs = numRegsForType(Ty);
+    if (NumRegsLeft >= NumRegs) {
+      NumRegsLeft -= NumRegs;
+      return ABIArgInfo::getDirect();
+    }
+  }
+
+  // Use pass-by-reference in stead of pass-by-value for struct arguments in
+  // function ABI.
+  return ABIArgInfo::getIndirectAliased(
+      getContext().getTypeAlignInChars(Ty),
+      getContext().getTargetAddressSpace(LangAS::opencl_private));
 }
 
-void SPIRVABIInfo::computeInfo(CGFunctionInfo &FI) const {
-  // The logic is same as in DefaultABIInfo with an exception on the kernel
-  // arguments handling.
+void AMDGCNSPIRVABIInfo::computeInfo(CGFunctionInfo &FI) const {
   llvm::CallingConv::ID CC = FI.getCallingConvention();
 
   if (!getCXXABI().classifyReturnType(FI))
     FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
 
+  NumRegsLeft = MaxNumRegsForArgsRet;
   for (auto &I : FI.arguments()) {
-    if (CC == llvm::CallingConv::SPIR_KERNEL) {
+    if (CC == llvm::CallingConv::SPIR_KERNEL)
       I.info = classifyKernelArgumentType(I.type);
-    } else {
+    else
      I.info = classifyArgumentType(I.type);
-    }
   }
 }
 
+llvm::FixedVectorType *AMDGCNSPIRVABIInfo::getOptimalVectorMemoryType(
+    llvm::FixedVectorType *Ty, const LangOptions &LangOpt) const {
+  // AMDGPU has legal instructions for 96-bit so 3x32 can be supported.
+  if (Ty->getNumElements() == 3 && getDataLayout().getTypeSizeInBits(Ty) == 96)
+    return Ty;
+  return DefaultABIInfo::getOptimalVectorMemoryType(Ty, LangOpt);
+}
+
 namespace clang {
 namespace CodeGen {
 void computeSPIRKernelABIInfo(CodeGenModule &CGM, CGFunctionInfo &FI) {
-  if (CGM.getTarget().getTriple().isSPIRV())
-    SPIRVABIInfo(CGM.getTypes()).computeInfo(FI);
-  else
+  if (CGM.getTarget().getTriple().isSPIRV()) {
+    if (CGM.getTarget().getTriple().getVendor() == llvm::Triple::AMD)
+      AMDGCNSPIRVABIInfo(CGM.getTypes()).computeInfo(FI);
+    else
+      SPIRVABIInfo(CGM.getTypes()).computeInfo(FI);
+  } else {
     CommonSPIRABIInfo(CGM.getTypes()).computeInfo(FI);
+  }
 }
 }
 }
```
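For context on the 16-register budget used throughout the new code, `numRegsForType` measures types in 32-bit registers, with 16-bit element vectors passed packed, two lanes per register. A minimal standalone sketch of that arithmetic (the helper names below are invented for illustration, not the Clang API):

```cpp
#include <iostream>

// Sketch of the counting in AMDGCNSPIRVABIInfo::numRegsForType: sizes
// are rounded up to whole 32-bit registers; records sum their fields.
static unsigned regsForScalar(unsigned SizeInBits) {
  return (SizeInBits + 31) / 32; // round up to whole 32-bit registers
}

static unsigned regsForVector(unsigned EltSizeInBits, unsigned NumElts) {
  if (EltSizeInBits == 16)
    return (NumElts + 1) / 2; // packed: two 16-bit lanes per register
  return regsForScalar(EltSizeInBits) * NumElts;
}

int main() {
  std::cout << regsForScalar(64) << '\n';    // double -> 2 registers
  std::cout << regsForVector(32, 4) << '\n'; // float4 -> 4 registers
  std::cout << regsForVector(16, 3) << '\n'; // half3  -> 2 registers
}
```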
