diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h
index a297d3d8f8498..6958a86c37cae 100644
--- a/llvm/include/llvm/CodeGen/MachineMemOperand.h
+++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h
@@ -50,27 +50,28 @@ struct MachinePointerInfo {
 
   uint8_t StackID;
 
+  const Value *OrgV;
+
   explicit MachinePointerInfo(const Value *v, int64_t offset = 0,
-                              uint8_t ID = 0)
-      : V(v), Offset(offset), StackID(ID) {
+                              uint8_t ID = 0, const Value *orgv = nullptr)
+      : V(v), Offset(offset), StackID(ID), OrgV(orgv) {
     AddrSpace = v ? v->getType()->getPointerAddressSpace() : 0;
   }
 
   explicit MachinePointerInfo(const PseudoSourceValue *v, int64_t offset = 0,
                               uint8_t ID = 0)
-      : V(v), Offset(offset), StackID(ID) {
+      : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
     AddrSpace = v ? v->getAddressSpace() : 0;
   }
 
   explicit MachinePointerInfo(unsigned AddressSpace = 0, int64_t offset = 0)
       : V((const Value *)nullptr), Offset(offset), AddrSpace(AddressSpace),
-        StackID(0) {}
+        StackID(0), OrgV((const Value *)nullptr) {}
 
   explicit MachinePointerInfo(
-      PointerUnion<const Value *, const PseudoSourceValue *> v,
-      int64_t offset = 0,
-      uint8_t ID = 0)
-      : V(v), Offset(offset), StackID(ID) {
+      PointerUnion<const Value *, const PseudoSourceValue *> v,
+      int64_t offset = 0, uint8_t ID = 0)
+      : V(v), Offset(offset), StackID(ID), OrgV((const Value *)nullptr) {
     if (V) {
       if (const auto *ValPtr = dyn_cast_if_present<const Value *>(V))
         AddrSpace = ValPtr->getType()->getPointerAddressSpace();
@@ -83,7 +84,8 @@ struct MachinePointerInfo {
     if (V.isNull())
       return MachinePointerInfo(AddrSpace, Offset + O);
     if (isa<const Value *>(V))
-      return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID);
+      return MachinePointerInfo(cast<const Value *>(V), Offset + O, StackID,
+                                OrgV);
     return MachinePointerInfo(cast<const PseudoSourceValue *>(V), Offset + O,
                               StackID);
   }
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 0d251697f2567..6f10ab39a8ab8 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -1050,7 +1050,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
 
   return isDereferenceableAndAlignedPointer(
       BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL,
-      dyn_cast<Instruction>(BasePtr));
+      dyn_cast<Instruction>(OrgV ? OrgV : BasePtr));
 }
 
 /// getConstantPool - Return a MachinePointerInfo record that refers to the
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 04d6fd5f48cc3..34d020f42e37c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4562,10 +4562,41 @@ static std::optional<ConstantRange> getRange(const Instruction &I) {
   return std::nullopt;
 }
 
+static void tryToImproveAlign(const DataLayout &DL, Type *Ty, Align &Alignment,
+                              const Value *&PtrV, const Value *&CxtI,
+                              int64_t &Offset) {
+  Align PrefAlign = DL.getPrefTypeAlign(Ty);
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(PtrV);
+      GEP && PrefAlign > Alignment && PrefAlign.previous() > Alignment) {
+    const Value *BasePtrV = GEP->getPointerOperand();
+    APInt OffsetAccumulated =
+        APInt(DL.getIndexTypeSizeInBits(GEP->getType()), 0);
+    if (GEP->accumulateConstantOffset(DL, OffsetAccumulated)) {
+      KnownBits Known = computeKnownBits(PtrV, DL);
+      KnownBits SplitKnown =
+          KnownBits::add(Known, KnownBits::makeConstant(APInt(
+                                    Known.getBitWidth(), Alignment.value())));
+      unsigned TrailZ = std::min(SplitKnown.countMinTrailingZeros(),
+                                 +Value::MaxAlignmentExponent);
+      Align ExpandAlign =
+          Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+      Align BaseAlignment =
+          getKnownAlignment(const_cast<Value *>(BasePtrV), DL, GEP);
+      if (ExpandAlign > Alignment) {
+        CxtI = PtrV;
+        PtrV = BasePtrV;
+        Alignment = BaseAlignment;
+        Offset = OffsetAccumulated.getSExtValue();
+      }
+    }
+  }
+}
+
 void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
   if (I.isAtomic())
     return visitAtomicLoad(I);
 
+  const DataLayout &DL = DAG.getDataLayout();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   const Value *SV = I.getOperand(0);
   if (TLI.supportSwiftError()) {
@@ -4587,7 +4618,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
   Type *Ty = I.getType();
   SmallVector<EVT, 4> ValueVTs, MemVTs;
   SmallVector<TypeSize, 4> Offsets;
-  ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &MemVTs, &Offsets);
+  ComputeValueVTs(TLI, DL, Ty, ValueVTs, &MemVTs, &Offsets);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0)
     return;
@@ -4597,7 +4628,12 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
   const MDNode *Ranges = getRangeMetadata(I);
   bool isVolatile = I.isVolatile();
   MachineMemOperand::Flags MMOFlags =
-      TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
+      TLI.getLoadMemOperandFlags(I, DL, AC, LibInfo);
+
+  // See visitStore comments.
+  int64_t Offset = 0;
+  const Value *CxtI = nullptr;
+  tryToImproveAlign(DL, Ty, Alignment, SV, CxtI, Offset);
 
   SDValue Root;
   bool ConstantMemory = false;
@@ -4647,7 +4683,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
     // TODO: MachinePointerInfo only supports a fixed length offset.
     MachinePointerInfo PtrInfo =
         !Offsets[i].isScalable() || Offsets[i].isZero()
-            ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue())
+            ? MachinePointerInfo(SV, Offsets[i].getKnownMinValue() + Offset, 0,
+                                 CxtI)
             : MachinePointerInfo();
 
     SDValue A = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
@@ -4734,6 +4771,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
   if (I.isAtomic())
     return visitAtomicStore(I);
 
+  const DataLayout &DL = DAG.getDataLayout();
   const Value *SrcV = I.getOperand(0);
   const Value *PtrV = I.getOperand(1);
 
@@ -4754,8 +4792,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
 
   SmallVector<EVT, 4> ValueVTs, MemVTs;
   SmallVector<TypeSize, 4> Offsets;
-  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
-                  SrcV->getType(), ValueVTs, &MemVTs, &Offsets);
+  ComputeValueVTs(DAG.getTargetLoweringInfo(), DL, SrcV->getType(), ValueVTs,
+                  &MemVTs, &Offsets);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0)
     return;
@@ -4772,7 +4810,19 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
   Align Alignment = I.getAlign();
   AAMDNodes AAInfo = I.getAAMetadata();
 
-  auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
+  // Refine MPI: V + Offset
+  // Example:
+  // align 4 %p
+  // %gep = getelementptr i8, ptr %p, i32 1
+  // store i32 %v, ptr %gep, align 1
+  // ->
+  // MPI: V = %p, Offset = 1
+  // SDNode: store<(store (s32) into %p + 1, align 1, basealign 4)>
+  int64_t Offset = 0;
+  const Value *CxtI = nullptr;
+  tryToImproveAlign(DL, SrcV->getType(), Alignment, PtrV, CxtI, Offset);
+
+  auto MMOFlags = TLI.getStoreMemOperandFlags(I, DL);
 
   unsigned ChainI = 0;
   for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
@@ -4787,7 +4837,8 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
     // TODO: MachinePointerInfo only supports a fixed length offset.
     MachinePointerInfo PtrInfo =
         !Offsets[i].isScalable() || Offsets[i].isZero()
-            ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue())
+            ? MachinePointerInfo(PtrV, Offsets[i].getKnownMinValue() + Offset,
+                                 0, CxtI)
             : MachinePointerInfo();
 
     SDValue Add = DAG.getObjectPtrOffset(dl, Ptr, Offsets[i]);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 000f8cc6786a5..7f3983db095d2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10370,6 +10370,59 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
   assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
          "Unaligned load of unsupported type.");
 
+  Align BaseAlignment = LD->getBaseAlign();
+  Align Alignment = LD->getAlign();
+
+  // Divide the load according to the latest alignment information.
+  if (commonAlignment(BaseAlignment,
+                      Alignment.value() + LD->getPointerInfo().Offset) >
+      Alignment) {
+    ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+    // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+    if (HiExtType == ISD::NON_EXTLOAD)
+      HiExtType = ISD::ZEXTLOAD;
+
+    bool IsLE = DAG.getDataLayout().isLittleEndian();
+    unsigned NumBytes = LoadedVT.getSizeInBits() / 8;
+    // LE/BE use the same initial Alignment.
+    unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+    unsigned RemainderBytes = NumBytes;
+    SDValue Result = DAG.getConstant(0, dl, VT);
+    SmallVector<SDValue, 8> Chains;
+    while (RemainderBytes) {
+      unsigned CurrBytes =
+          std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+      ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
+      if (RemainderBytes + CurrBytes == NumBytes)
+        ExtType = HiExtType;
+
+      SDValue CurrLD = DAG.getExtLoad(
+          ExtType, dl, VT, Chain,
+          DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+          LD->getPointerInfo().getWithOffset(PtrOffset),
+          EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+          LD->getMemOperand()->getFlags(), LD->getAAInfo());
+      if (IsLE)
+        Chains.push_back(CurrLD.getValue(1));
+      else
+        Chains.insert(Chains.begin(), CurrLD.getValue(1));
+      SDValue CurrV = DAG.getNode(
+          ISD::SHL, dl, VT, CurrLD,
+          DAG.getShiftAmountConstant((NumBytes - RemainderBytes) * 8, VT, dl));
+      Result = DAG.getNode(ISD::OR, dl, VT, CurrV, Result);
+      RemainderBytes -= CurrBytes;
+      if (RemainderBytes == 0)
+        break;
+      Alignment = commonAlignment(BaseAlignment,
+                                  LD->getPointerInfo().Offset + PtrOffset +
+                                      (IsLE ? CurrBytes : -CurrBytes));
+      PtrOffset =
+          IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+    }
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+    return std::make_pair(Result, TF);
+  }
   // Compute the new VT that is half the size of the old one. This is an
   // integer MVT.
   unsigned NumBits = LoadedVT.getSizeInBits();
@@ -10377,7 +10430,6 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
   NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
   NumBits >>= 1;
 
-  Align Alignment = LD->getBaseAlign();
   unsigned IncrementSize = NumBits / 8;
   ISD::LoadExtType HiExtType = LD->getExtensionType();
 
@@ -10389,24 +10441,24 @@ TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
   SDValue Lo, Hi;
   if (DAG.getDataLayout().isLittleEndian()) {
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
     Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
   } else {
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
     Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
-                        NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
-                        LD->getAAInfo());
+                        NewLoadedVT, BaseAlignment,
+                        LD->getMemOperand()->getFlags(), LD->getAAInfo());
   }
 
   // aggregate the two parts
@@ -10428,7 +10480,8 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
   SDValue Ptr = ST->getBasePtr();
   SDValue Val = ST->getValue();
   EVT VT = Val.getValueType();
-  Align Alignment = ST->getBaseAlign();
+  Align BaseAlignment = ST->getBaseAlign();
+  Align Alignment = ST->getAlign();
   auto &MF = DAG.getMachineFunction();
   EVT StoreMemVT = ST->getMemoryVT();
 
@@ -10447,7 +10500,7 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
     // FIXME: Does not handle truncating floating point stores!
     SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
     Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
-                          Alignment, ST->getMemOperand()->getFlags());
+                          BaseAlignment, ST->getMemOperand()->getFlags());
     return Result;
   }
   // Do a (aligned) store to a stack slot, then copy from the stack slot
@@ -10515,6 +10568,47 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
   assert(StoreMemVT.isInteger() && !StoreMemVT.isVector() &&
          "Unaligned store of unknown type.");
 
+
+  // Divide the store value according to the latest alignment information.
+  if (commonAlignment(BaseAlignment,
+                      Alignment.value() + ST->getPointerInfo().Offset) >
+      Alignment) {
+    bool IsLE = DAG.getDataLayout().isLittleEndian();
+    unsigned NumBytes = StoreMemVT.getFixedSizeInBits() / 8;
+    SmallVector<SDValue, 8> Stores;
+    // LE/BE use the same initial Alignment.
+    unsigned PtrOffset = IsLE ? 0 : (NumBytes - Alignment.value());
+    unsigned RemainderBytes = NumBytes;
+    while (RemainderBytes) {
+      unsigned CurrBytes =
+          std::min(1ul << Log2_32(RemainderBytes), Alignment.value());
+      SDValue CurrST = DAG.getTruncStore(
+          Chain, dl, Val,
+          DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(PtrOffset)),
+          ST->getPointerInfo().getWithOffset(PtrOffset),
+          EVT::getIntegerVT(*DAG.getContext(), CurrBytes * 8), BaseAlignment,
+          ST->getMemOperand()->getFlags(), ST->getAAInfo());
+      if (IsLE)
+        Stores.push_back(CurrST);
+      else
+        Stores.insert(Stores.begin(), CurrST);
+      RemainderBytes -= CurrBytes;
+      if (RemainderBytes == 0)
+        break;
+
+      Val = DAG.getNode(ISD::SRL, dl, VT, Val,
+                        DAG.getShiftAmountConstant(CurrBytes * 8, VT, dl));
+      Alignment = commonAlignment(BaseAlignment,
+                                  ST->getPointerInfo().Offset + PtrOffset +
+                                      (IsLE ? CurrBytes : -CurrBytes));
+      PtrOffset =
+          IsLE ? NumBytes - RemainderBytes : RemainderBytes - Alignment.value();
+    }
+
+    SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+    return Result;
+  }
+
   // Get the half-size VT
   EVT NewStoredVT = StoreMemVT.getHalfSizedIntegerVT(*DAG.getContext());
   unsigned NumBits = NewStoredVT.getFixedSizeInBits();
@@ -10538,17 +10632,18 @@ SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
   SDValue Store1, Store2;
   Store1 = DAG.getTruncStore(Chain, dl,
                              DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
-                             Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
-                             ST->getMemOperand()->getFlags());
+                             Ptr, ST->getPointerInfo(), NewStoredVT,
+                             BaseAlignment, ST->getMemOperand()->getFlags());
 
   Ptr = DAG.getObjectPtrOffset(dl, Ptr, TypeSize::getFixed(IncrementSize));
   Store2 = DAG.getTruncStore(
       Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
-      ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
-      ST->getMemOperand()->getFlags(), ST->getAAInfo());
+      ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT,
+      BaseAlignment, ST->getMemOperand()->getFlags(), ST->getAAInfo());
 
   SDValue Result =
       DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+
   return Result;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 509ba295ea7f7..55e695bc7b9bc 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -14,8 +14,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
 ; GFX90A-NEXT:   $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3
 ; GFX90A-NEXT:   renamable $vgpr31 = COPY $vgpr0, implicit $exec
 ; GFX90A-NEXT:   renamable $sgpr17 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
-; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
-; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
+; GFX90A-NEXT:   early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.f1.kernarg.segment + 24, align 8, addrspace 4)
+; GFX90A-NEXT:   renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.f1.kernarg.segment + 40, align 8, addrspace 4)
 ; GFX90A-NEXT:   renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
 ; GFX90A-NEXT:   renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
 ; GFX90A-NEXT:   S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 3303cb86c874e..557aa8f35001f 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -113,7 +113,7 @@ define amdgpu_kernel void @uniform_trunc_i64_to_i1(ptr addrspace(1) %out, i64 %x
 ; GCN-NEXT:   liveins: $sgpr4_sgpr5
 ; GCN-NEXT: {{  $}}
 ; GCN-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.out.kernarg.offset, align 4, addrspace 4)
+; GCN-NEXT:   [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s128) from %ir.uniform_trunc_i64_to_i1.kernarg.segment + 36, align 4, basealign 16, addrspace 4)
 ; GCN-NEXT:   [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 13, 0 :: (dereferenceable invariant load (s32) from %ir.z.kernarg.offset.align.down, addrspace 4)
 ; GCN-NEXT:   [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub1
 ; GCN-NEXT:   [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX4_IMM]].sub0
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a18b5b5396f63..18ec2144f13d4 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -4708,13 +4708,12 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
 ; SI-NEXT:    s_load_dword s2, s[4:5], 0x9
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa
 ; SI-NEXT:    buffer_load_ubyte v4, off, s[4:7], 0 offset:49
-; SI-NEXT:    buffer_load_ubyte v5, off, s[4:7], 0 offset:50
-; SI-NEXT:    buffer_load_ubyte v6, off, s[4:7], 0 offset:51
-; SI-NEXT:    buffer_load_ubyte v7, off, s[4:7], 0 offset:52
-; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:53
+; SI-NEXT:    buffer_load_ushort v5, off, s[4:7], 0 offset:50
+; SI-NEXT:    s_load_dword s3, s[4:5], 0xd
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v2, s2
@@ -4725,11 +4724,9 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
 ; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
+; SI-NEXT:    s_lshl_b32 s0, s3, 24
 ; SI-NEXT:    v_or_b32_e32 v2, v2, v4
-; SI-NEXT:    v_or_b32_e32 v3, v3, v6
-; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; SI-NEXT:    v_or_b32_e32 v2, v3, v2
+; SI-NEXT:    v_or_b32_e32 v2, s0, v2
 ; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -4741,46 +4738,39 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
 ; VI-NEXT:    s_add_u32 s0, s4, 49
 ; VI-NEXT:    s_addc_u32 s1, s5, 0
 ; VI-NEXT:    s_add_u32 s2, s4, 50
-; VI-NEXT:    s_addc_u32 s3, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v2, s0
-; VI-NEXT:    s_add_u32 s0, s0, 3
-; VI-NEXT:    s_addc_u32 s1, s1, 0
-; VI-NEXT:    v_mov_b32_e32 v5, s1
-; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    s_add_u32 s0, s4, 51
-; VI-NEXT:    s_addc_u32 s1, s5, 0
+; VI-NEXT:    s_addc_u32 s3, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_mov_b32_e32 v7, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s0
+; VI-NEXT:    s_load_dword s0, s[4:5], 0x34
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_mov_b32_e32 v6, s0
-; VI-NEXT:    flat_load_ubyte v8, v[0:1]
-; VI-NEXT:    flat_load_ubyte v9, v[2:3]
-; VI-NEXT:    flat_load_ubyte v10, v[4:5]
-; VI-NEXT:    flat_load_ubyte v6, v[6:7]
+; VI-NEXT:    flat_load_ushort v4, v[0:1]
+; VI-NEXT:    flat_load_ubyte v2, v[2:3]
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s2, s0, 24
 ; VI-NEXT:    s_add_u32 s0, s4, 53
 ; VI-NEXT:    s_addc_u32 s1, s5, 0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT:    s_load_dword s2, s[4:5], 0x24
+; VI-NEXT:    s_load_dword s3, s[4:5], 0x24
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x28
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_or_b32_e32 v2, v3, v2
+; VI-NEXT:    v_or_b32_e32 v6, s2, v2
 ; VI-NEXT:    v_mov_b32_e32 v2, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v7, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    flat_store_dword v[2:3], v4
+; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
-; VI-NEXT:    flat_store_dword v[2:3], v7
-; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
-; VI-NEXT:    v_or_b32_e32 v4, v4, v9
-; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
-; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT:    v_or_b32_e32 v4, v5, v4
-; VI-NEXT:    flat_store_dword v[2:3], v4
+; VI-NEXT:    flat_store_dword v[2:3], v6
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
new file mode 100644
index 0000000000000..721ef95a21866
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store-with-aligned.ll
@@ -0,0 +1,142 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s
+
+define void @store_b32_basealign2_offset1(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srli a2, a1, 24
+; CHECK-NEXT:    srli a3, a1, 8
+; CHECK-NEXT:    sb a1, 1(a0)
+; CHECK-NEXT:    sh a3, 2(a0)
+; CHECK-NEXT:    sb a2, 4(a0)
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+  store i32 %v, ptr %len, align 1
+  ret void
+}
+
+define void @store_b32_basealign2_offset3(ptr align 2 %p, i32 %v) {
+; CHECK-LABEL: store_b32_basealign2_offset3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    srli a2, a1, 24
+; CHECK-NEXT:    srli a3, a1, 8
+; CHECK-NEXT:    sb a1, 3(a0)
+; CHECK-NEXT:    sh a3, 4(a0)
+; CHECK-NEXT:    sb a2, 6(a0)
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+  store i32 %v, ptr %len, align 1
+  ret void
+}
+
+define void @store_b64_basealign4_offset1(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sb zero, 1(a0)
+; CHECK-NEXT:    sh zero, 2(a0)
+; CHECK-NEXT:    sw zero, 4(a0)
+; CHECK-NEXT:    sb zero, 8(a0)
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+  store i64 0, ptr %len, align 1
+  ret void
+}
+
+define void @store_b64_basealign4_offset2(ptr align 4 %p) {
+; CHECK-LABEL: store_b64_basealign4_offset2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sh zero, 2(a0)
+; CHECK-NEXT:    sw zero, 4(a0)
+; CHECK-NEXT:    sh zero, 8(a0)
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+  store i64 0, ptr %len, align 2
+  ret void
+}
+
+define i32 @load_b32_base_align2_offset1(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lhu a1, 2(a0)
+; CHECK-NEXT:    lbu a2, 1(a0)
+; CHECK-NEXT:    lbu a0, 4(a0)
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    or a1, a1, a2
+; CHECK-NEXT:    slli a0, a0, 24
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 1
+  %v = load i32, ptr %len, align 1
+  ret i32 %v
+}
+
+define i32 @load_b32_base_align2_offset3(ptr align 2 %p) {
+; CHECK-LABEL: load_b32_base_align2_offset3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lhu a1, 4(a0)
+; CHECK-NEXT:    lbu a2, 3(a0)
+; CHECK-NEXT:    lbu a0, 6(a0)
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    or a1, a1, a2
+; CHECK-NEXT:    slli a0, a0, 24
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+  %v = load i32, ptr %len, align 1
+  ret i32 %v
+}
+
+define i64 @load_b64_base_align2_offset1(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lbu a1, 3(a0)
+; CHECK-NEXT:    lbu a2, 4(a0)
+; CHECK-NEXT:    lbu a3, 5(a0)
+; CHECK-NEXT:    lbu a4, 2(a0)
+; CHECK-NEXT:    slli a1, a1, 8
+; CHECK-NEXT:    slli a2, a2, 16
+; CHECK-NEXT:    slli a3, a3, 24
+; CHECK-NEXT:    or a1, a1, a4
+; CHECK-NEXT:    or a2, a3, a2
+; CHECK-NEXT:    lbu a3, 7(a0)
+; CHECK-NEXT:    lbu a4, 6(a0)
+; CHECK-NEXT:    lbu a5, 8(a0)
+; CHECK-NEXT:    lbu a0, 9(a0)
+; CHECK-NEXT:    slli a3, a3, 8
+; CHECK-NEXT:    or a3, a3, a4
+; CHECK-NEXT:    slli a5, a5, 16
+; CHECK-NEXT:    slli a0, a0, 24
+; CHECK-NEXT:    or a5, a0, a5
+; CHECK-NEXT:    or a0, a2, a1
+; CHECK-NEXT:    or a1, a5, a3
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+  %v = load i64, ptr %len, align 1
+  ret i64 %v
+}
+
+define i64 @load_b64_base_align2_offset2(ptr align 4 %p) {
+; CHECK-LABEL: load_b64_base_align2_offset2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lhu a1, 4(a0)
+; CHECK-NEXT:    lhu a2, 2(a0)
+; CHECK-NEXT:    lhu a3, 8(a0)
+; CHECK-NEXT:    lhu a4, 6(a0)
+; CHECK-NEXT:    slli a0, a1, 16
+; CHECK-NEXT:    or a0, a0, a2
+; CHECK-NEXT:    slli a1, a3, 16
+; CHECK-NEXT:    or a1, a1, a4
+; CHECK-NEXT:    ret
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 2
+  %v = load i64, ptr %len, align 2
+  ret i64 %v
+}