[SelectionDAG] Optimize unaligned load stores to realign after offset #145309

Draft · wants to merge 2 commits into main
20 changes: 11 additions & 9 deletions llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,27 +50,28 @@ struct MachinePointerInfo {

uint8_t StackID;

const Value *OrgV;
Contributor:

Why does this need OrgV?

Contributor Author:

@nikic In isDereferenceableAndAlignedPointer(const Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, const Instruction *CtxI, AssumptionCache *AC, const DominatorTree *DT, const TargetLibraryInfo *TLI), both V and CtxI are passed in, and when V is a GEP it is walked back to the GEP's pointer operand. My change makes V the GEP's pointer operand from the start, so OrgV is needed to supply CtxI; otherwise isValidAssumeForContext would always return false, because V would always equal CtxI:

    // Don't let an assume affect itself - this would cause the problems
    // `isEphemeralValueOf` is trying to prevent, and it would also make
    // the loop below go out of bounds.
    if (!AllowEphemerals && Inv == CxtI)
      return false;

Demo IR: without OrgV, both Inv and CxtI are %s_exp_v3f32.kernarg.segment:

define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) #0 {
  %s_exp_v3f32.kernarg.segment = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %out.kernarg.offset1 = bitcast i8 addrspace(4)* %s_exp_v3f32.kernarg.segment to i8 addrspace(4)*
  %out.kernarg.offset.cast = bitcast i8 addrspace(4)* %out.kernarg.offset1 to ptr addrspace(4), !amdgpu.uniform !0
  %out.load = load ptr addrspace(1), ptr addrspace(4) %out.kernarg.offset.cast, align 16, !invariant.load !0
  %in.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %s_exp_v3f32.kernarg.segment, i64 16
  %in.kernarg.offset.cast = bitcast i8 addrspace(4)* %in.kernarg.offset to <4 x float> addrspace(4)*, !amdgpu.uniform !0
  %1 = load <4 x float>, <4 x float> addrspace(4)* %in.kernarg.offset.cast, align 16, !invariant.load !0
  %in.load = shufflevector <4 x float> %1, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
  %result = call <3 x float> @llvm.exp.v3f32(<3 x float> %in.load)
  store <3 x float> %result, ptr addrspace(1) %out.load, align 16
  ret void
}

Contributor:

We should be able to use isValidAssumeForContext with AllowEphemerals=true when determining dereferenceability.

Contributor Author:

OK, so we can do this because we know for sure that CtxI is not an assume instruction?
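For reference, a minimal sketch of that suggestion, assuming the isValidAssumeForContext overload in llvm/Analysis/ValueTracking.h that takes an AllowEphemerals flag (the surrounding call site and the Assume variable are illustrative, not part of this patch):

// Sketch: accept the assume even when the context instruction is the
// assumed pointer itself; CtxI here is a memory access, not one of the
// assume's own ephemeral values, so the self-reference check can be
// relaxed.
if (isValidAssumeForContext(Assume, CtxI, /*DT=*/nullptr,
                            /*AllowEphemerals=*/true))
  // ... apply the assumed alignment/dereferenceable bytes ...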


explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
-                              uint8_t ID = 0)
-      : V(v), Offset(offset), StackID(ID) {
+                              uint8_t ID = 0, const Value *orgv = nullptr)
+      : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
}

explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
uint8_t ID = 0)
-      : V(v), Offset(offset), StackID(ID) {
+      : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
AddrSpace = v ? v->getAddressSpace() : 0;
}

explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
: V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
-        StackID(0) {}
+        StackID(0), OrgV((const Value *)nullptr) {}

explicit MachinePointerInfo(
-      PointerUnion<const Value *, const PseudoSourceValue *> v,
-      int64_t offset = 0,
-      uint8_t ID = 0)
-      : V(v), Offset(offset), StackID(ID) {
+      PointerUnion<const Value *, const PseudoSourceValue *> v,
+      int64_t offset = 0, uint8_t ID = 0)
+      : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
if (V) {
if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -83,7 +84,8 @@ struct MachinePointerInfo {
if (V.isNull())
return MachinePointerInfo(AddrSpace, Offset + O);
if (isa<const Value *>(V))
-      return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
+      return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
+                                OrgV);
return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
StackID);
}
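As an illustration of the new plumbing, a hypothetical usage sketch (BasePtr and GEPV are stand-ins, not names from this patch): V now tracks the GEP's base pointer while OrgV remembers the original GEP value, so the dereferenceability query below can anchor at the real access context.

// Hypothetical: BasePtr is the GEP's pointer operand, GEPV the original
// GEP value the IR access used.
MachinePointerInfo MPI(BasePtr, /*offset=*/4, /*ID=*/0, /*orgv=*/GEPV);
// getWithOffset forwards OrgV along with the accumulated offset.
MachinePointerInfo MPI2 = MPI.getWithOffset(4); // Offset == 8, OrgV == GEPV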
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,

return isDereferenceableAndAlignedPointer(
BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
-      dyn_cast<Instruction>(BasePtr));
+      dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
}

/// getConstantPool - Return a MachinePointerInfo record that refers to the
65 changes: 58 additions & 7 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4562,10 +4562,41 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
return std::nullopt;
}

static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
const Value *&PtrV, const Value *&CxtI,
int64_t &Offset) {
Align PrefAlign = DL.getPrefTypeAlign(Ty);
if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
const Value *BasePtrV = GEP->getPointerOperand();
APInt OffsetAccumulated =
APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
KnownBits Known = computeKnownBits(PtrV, DL);
KnownBits SplitKnown =
KnownBits::add(Known, KnownBits::makeConstant(APInt(
Known.getBitWidth(), Alignment.value())));
unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
+Value::MaxAlignmentExponent);
Align ExpandAlign =
Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
Align BaseAlignment =
getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
if (ExpandAlign > Alignment) {
CxtI = PtrV;
PtrV = BasePtrV;
Alignment = BaseAlignment;
Offset = OffsetAccumulated.getSExtValue();
}
}
}
}

void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (I.isAtomic())
return visitAtomicLoad(I);

const DataLayout &DL = DAG.getDataLayout();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const Value *SV = I.getOperand(0);
if (TLI.supportSwiftError()) {
@@ -4587,7 +4618,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
-  ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+  ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4597,7 +4628,12 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
const MDNode *Ranges = getRangeMetadata(I);
bool isVolatile = I.isVolatile();
MachineMemOperand::Flags MMOFlags =
-      TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+      TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+  // See visitStore comments.
+  int64_t Offset = 0;
+  const Value *CxtI = nullptr;
+  tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);

SDValue Root;
bool ConstantMemory = false;
@@ -4647,7 +4683,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
-            ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+            ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
+                                 CxtI)
: MachinePointerInfo();

SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4734,6 +4771,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
if (I.isAtomic())
return visitAtomicStore(I);

const DataLayout &DL = DAG.getDataLayout();
const Value *SrcV = I.getOperand(0);
const Value *PtrV = I.getOperand(1);

@@ -4754,8 +4792,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {

SmallVector<EVT, 4> ValueVTs, MemVTs;
SmallVector<TypeSize, 4> Offsets;
-  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
-                  SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+  ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+                  &MemVTs, &Offsets);
unsigned NumValues = ValueVTs.size();
if (NumValues == 0)
return;
@@ -4772,7 +4810,19 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
Align Alignment = I.getAlign();
AAMDNodes AAInfo = I.getAAMetadata();

-  auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+  // Refine MPI: V + Offset
+  // Example:
+  //   align 4 %p
+  //   %gep = getelementptr i8, ptr %p, i32 1
+  //   store i32 %v, ptr %gep, align 1
+  // ->
+  //   MPI: V = %p, Offset = 1
+  //   SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+  int64_t Offset = 0;
+  const Value *CxtI = nullptr;
+  tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+
+  auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);

unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
Expand All @@ -4787,7 +4837,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
// TODO: MachinePointerInfo only supports a fixed length offset.
MachinePointerInfo PtrInfo =
!Offsets[i].isScalable() || Offsets[i].isZero()
-            ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+            ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
+                                 0, CxtI)
: MachinePointerInfo();

SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
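As a worked illustration of the alignment inference in tryToImproveAlign (a standalone C++20 sketch with assumed values, not LLVM code): if the base is 16-byte aligned and the GEP adds a constant offset of 1, the access itself only claims align 1, but the address immediately after the first one-byte chunk is 2-byte aligned, which is what the KnownBits::add trick computes.

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t PtrLowBits = 1; // ptr == base + 1, base 16-byte aligned
  uint64_t Alignment = 1;  // alignment the access currently claims
  // Trailing zeros of (ptr + Alignment) give the alignment available
  // right after the first Alignment-sized chunk.
  unsigned TrailZ = std::countr_zero(PtrLowBits + Alignment);
  std::printf("expand align = %llu\n", 1ull << TrailZ); // prints 2
}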
125 changes: 110 additions & 15 deletions llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10370,14 +10370,66 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
"Unaligned load of unsupported type.");

Align BaseAlignment = LD->getBaseAlign();
Align Alignment = LD->getAlign();

// Split the load according to the refined alignment information
if (commonAlignment(BaseAlignment,
Alignment.value() + LD->getPointerInfo().Offset) >
Alignment) {
ISD::LoadExtType HiExtType = LD->getExtensionType();

// If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
if (HiExtType == ISD::NON_EXTLOAD)
HiExtType = ISD::ZEXTLOAD;

bool IsLE = DAG.getDataLayout().isLittleEndian();
unsigned NumBytes = LoadedVT.getSizeInBits() / 8;
// LE/BE use the same initial Alignment
unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
unsigned RemainderBytes = NumBytes;
SDValue Result = DAG.getConstant(0, dl, VT);
SmallVector<SDValue, 4> Chains;
while (RemainderBytes) {
unsigned CurrBytes =
std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
if (RemainderBytes + CurrBytes == NumBytes)
ExtType = HiExtType;

SDValue CurrLD = DAG.getExtLoad(
ExtType, dl, VT, Chain,
DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
LD->getPointerInfo().getWithOffset(PtrOffset),
EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
if (IsLE)
Chains.push_back(CurrLD.getValue(1));
else
Chains.insert(Chains.begin(), CurrLD.getValue(1));
SDValue CurrV = DAG.getNode(
ISD::SHL, dl, VT, CurrLD,
DAG.getShiftAmountConstant((NumBytes - RemainderBytes) * 8, VT, dl));
Result = DAG.getNode(ISD::OR, dl, VT, CurrV, Result);
RemainderBytes -= CurrBytes;
if (RemainderBytes == 0)
break;
Alignment = commonAlignment(BaseAlignment,
LD->getPointerInfo().Offset + PtrOffset +
(IsLE ? CurrBytes : -CurrBytes));
PtrOffset =
IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
}
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
return std::make_pair(Result, TF);
}
// Compute the new VT that is half the size of the old one. This is an
// integer MVT.
unsigned NumBits = LoadedVT.getSizeInBits();
EVT NewLoadedVT;
NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
NumBits >>= 1;

-  Align Alignment = LD->getBaseAlign();
unsigned IncrementSize = NumBits / 8;
ISD::LoadExtType HiExtType = LD->getExtensionType();

@@ -10389,24 +10441,24 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
SDValue Lo, Hi;
if (DAG.getDataLayout().isLittleEndian()) {
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());

Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
} else {
Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());

Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
}

// aggregate the two parts
@@ -10428,7 +10480,8 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Ptr = ST->getBasePtr();
SDValue Val = ST->getValue();
EVT VT = Val.getValueType();
-  Align Alignment = ST->getBaseAlign();
+  Align BaseAlignment = ST->getBaseAlign();
+  Align Alignment = ST->getAlign();
auto &MF = DAG.getMachineFunction();
EVT StoreMemVT = ST->getMemoryVT();

@@ -10447,7 +10500,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
// FIXME: Does not handle truncating floating point stores!
SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
-                          Alignment, ST->getMemOperand()->getFlags());
+                          BaseAlignment, ST->getMemOperand()->getFlags());
return Result;
}
// Do a (aligned) store to a stack slot, then copy from the stack slot
@@ -10515,6 +10568,47 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,

assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
"Unaligned store of unknown type.");

// Split the store value according to the refined alignment information
if (commonAlignment(BaseAlignment,
Alignment.value() + ST->getPointerInfo().Offset) >
Alignment) {
bool IsLE = DAG.getDataLayout().isLittleEndian();
unsigned NumBytes = StoreMemVT.getFixedSizeInBits() / 8;
SmallVector<SDValue, 8> Stores;
// LE/BE use the same initial Alignment
unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
unsigned RemainderBytes = NumBytes;
while (RemainderBytes) {
unsigned CurrBytes =
std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
SDValue CurrST = DAG.getTruncStore(
Chain, dl, Val,
DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
ST->getPointerInfo().getWithOffset(PtrOffset),
EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
ST->getMemOperand()->getFlags(), ST->getAAInfo());
if (IsLE)
Stores.push_back(CurrST);
else
Stores.insert(Stores.begin(), CurrST);
RemainderBytes -= CurrBytes;
if (RemainderBytes == 0)
break;

Val = DAG.getNode(ISD::SRL, dl, VT, Val,
DAG.getShiftAmountConstant(CurrBytes * 8, VT, dl));
Alignment = commonAlignment(BaseAlignment,
ST->getPointerInfo().Offset + PtrOffset +
(IsLE ? CurrBytes : -CurrBytes));
PtrOffset =
IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
}

SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
return Result;
}

// Get the half-size VT
EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
unsigned NumBits = NewStoredVT.getFixedSizeInBits();
@@ -10538,17 +10632,18 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
SDValue Store1, Store2;
Store1 = DAG.getTruncStore(Chain, dl,
DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
-                             Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
-                             ST->getMemOperand()->getFlags());
+                             Ptr, ST->getPointerInfo(), NewStoredVT,
+                             BaseAlignment, ST->getMemOperand()->getFlags());

Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
Store2 = DAG.getTruncStore(
Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
-      ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
-      ST->getMemOperand()->getFlags(), ST->getAAInfo());
+      ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
+      BaseAlignment, ST->getMemOperand()->getFlags(), ST->getAAInfo());

SDValue Result =
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);

return Result;
}

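To see what the two splitting loops above produce, here is a standalone C++20 sketch of the little-endian chunk schedule (it mirrors the loop logic but is not LLVM code; the names and the power-of-two-alignment assumption are mine):

#include <algorithm>
#include <bit>
#include <cstdio>
#include <vector>

struct Chunk { unsigned Offset, Bytes, Align; };

// Each chunk is the largest power-of-two size allowed by both the
// remaining bytes and the current alignment; after every chunk the
// alignment is recomputed from the base alignment and the absolute
// offset reached so far, exactly like the loops above.
static std::vector<Chunk> splitSchedule(unsigned NumBytes, unsigned Align,
                                        unsigned BaseAlign,
                                        unsigned MPIOffset) {
  std::vector<Chunk> Chunks;
  unsigned PtrOffset = 0, Remainder = NumBytes;
  while (Remainder) {
    unsigned CurrBytes = std::min(std::bit_floor(Remainder), Align);
    Chunks.push_back({PtrOffset, CurrBytes, Align});
    Remainder -= CurrBytes;
    if (Remainder == 0)
      break;
    unsigned Reached = MPIOffset + PtrOffset + CurrBytes;
    Align = std::min(BaseAlign, 1u << std::countr_zero(Reached));
    PtrOffset = NumBytes - Remainder;
  }
  return Chunks;
}

int main() {
  // i64 access one byte past an 8-byte-aligned base: 1@0, 2@1, 4@3, 1@7.
  for (const Chunk &C : splitSchedule(8, 1, 8, 1))
    std::printf("%u bytes at offset %u (align %u)\n", C.Bytes, C.Offset,
                C.Align);
}

For an i64 access at base + 1 with an 8-byte-aligned base this prints chunks of 1, 2, 4, and 1 bytes, each naturally aligned, which is the point of realigning after the offset.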
@@ -14,8 +14,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
-; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
-; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.f1.kernarg.segment + 24, align 8, addrspace 4)
+; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.f1.kernarg.segment + 40, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
; GCN-NEXT: liveins: $sgpr4_sgpr5
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.uniform_trunc_i64_to_i1.kernarg.segment + 36, align 4, basealign 16, addrspace 4)
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0