Skip to content

Commit 6270d1a

Browse files
committed
Stride VectorEndPointer for reverse interleaved access
1 parent 0f530f8 commit 6270d1a

File tree

9 files changed

+117
-87
lines changed

9 files changed

+117
-87
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7767,8 +7767,9 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
77677767
(CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
77687768
? GEPNoWrapFlags::none()
77697769
: GEPNoWrapFlags::inBounds();
7770-
VectorPtr = new VPVectorEndPointerRecipe(
7771-
Ptr, &Plan.getVF(), getLoadStoreType(I), Flags, I->getDebugLoc());
7770+
VectorPtr =
7771+
new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
7772+
/*Stride*/ -1, Flags, I->getDebugLoc());
77727773
} else {
77737774
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
77747775
GEP ? GEP->getNoWrapFlags()

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1702,12 +1702,16 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
17021702
public VPUnrollPartAccessor<2> {
17031703
Type *IndexedTy;
17041704

1705+
int64_t Stride;
1706+
17051707
public:
17061708
VPVectorEndPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy,
1707-
GEPNoWrapFlags GEPFlags, DebugLoc DL)
1709+
int64_t Stride, GEPNoWrapFlags GEPFlags, DebugLoc DL)
17081710
: VPRecipeWithIRFlags(VPDef::VPVectorEndPointerSC,
17091711
ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL),
1710-
IndexedTy(IndexedTy) {}
1712+
IndexedTy(IndexedTy), Stride(Stride) {
1713+
assert(Stride != 0 && "Unexpected stride");
1714+
}
17111715

17121716
VP_CLASSOF_IMPL(VPDef::VPVectorEndPointerSC)
17131717

@@ -1739,7 +1743,8 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
17391743

17401744
VPVectorEndPointerRecipe *clone() override {
17411745
return new VPVectorEndPointerRecipe(getOperand(0), getVFValue(), IndexedTy,
1742-
getGEPNoWrapFlags(), getDebugLoc());
1746+
Stride, getGEPNoWrapFlags(),
1747+
getDebugLoc());
17431748
}
17441749

17451750
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2193,31 +2193,34 @@ void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
21932193
}
21942194
#endif
21952195

2196-
static Type *getGEPIndexTy(bool IsScalable, bool IsReverse,
2196+
static Type *getGEPIndexTy(bool IsScalable, bool IsReverse, bool IsUnitStride,
21972197
unsigned CurrentPart, IRBuilderBase &Builder) {
21982198
// Use i32 for the gep index type when the value is constant,
21992199
// or query DataLayout for a more suitable index type otherwise.
22002200
const DataLayout &DL = Builder.GetInsertBlock()->getDataLayout();
2201-
return IsScalable && (IsReverse || CurrentPart > 0)
2201+
return !IsUnitStride || (IsScalable && (IsReverse || CurrentPart > 0))
22022202
? DL.getIndexType(Builder.getPtrTy(0))
22032203
: Builder.getInt32Ty();
22042204
}
22052205

22062206
void VPVectorEndPointerRecipe::execute(VPTransformState &State) {
22072207
auto &Builder = State.Builder;
22082208
unsigned CurrentPart = getUnrollPart(*this);
2209+
bool IsUnitStride = Stride == 1 || Stride == -1;
22092210
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ true,
2210-
CurrentPart, Builder);
2211+
IsUnitStride, CurrentPart, Builder);
22112212

22122213
// The wide store needs to start at the last vector element.
22132214
Value *RunTimeVF = State.get(getVFValue(), VPLane(0));
22142215
if (IndexTy != RunTimeVF->getType())
22152216
RunTimeVF = Builder.CreateZExtOrTrunc(RunTimeVF, IndexTy);
2216-
// NumElt = -CurrentPart * RunTimeVF
2217+
// NumElt = Stride * CurrentPart * RunTimeVF
22172218
Value *NumElt = Builder.CreateMul(
2218-
ConstantInt::get(IndexTy, -(int64_t)CurrentPart), RunTimeVF);
2219-
// LastLane = 1 - RunTimeVF
2220-
Value *LastLane = Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
2219+
ConstantInt::get(IndexTy, Stride * (int64_t)CurrentPart), RunTimeVF);
2220+
// LastLane = Stride * (RunTimeVF - 1)
2221+
Value *LastLane = Builder.CreateSub(RunTimeVF, ConstantInt::get(IndexTy, 1));
2222+
if (Stride != 1)
2223+
LastLane = Builder.CreateMul(ConstantInt::get(IndexTy, Stride), LastLane);
22212224
Value *Ptr = State.get(getOperand(0), VPLane(0));
22222225
Value *ResultPtr =
22232226
Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", getGEPNoWrapFlags());
@@ -2242,7 +2245,7 @@ void VPVectorPointerRecipe::execute(VPTransformState &State) {
22422245
auto &Builder = State.Builder;
22432246
unsigned CurrentPart = getUnrollPart(*this);
22442247
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
2245-
CurrentPart, Builder);
2248+
/*IsUnitStride*/ true, CurrentPart, Builder);
22462249
Value *Ptr = State.get(getOperand(0), VPLane(0));
22472250

22482251
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2496,8 +2496,9 @@ void VPlanTransforms::createInterleaveGroups(
24962496
if (IG->isReverse()) {
24972497
auto *GEP = dyn_cast<GetElementPtrInst>(
24982498
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts());
2499-
auto *ReversePtr = new VPReverseInterleavePtrRecipe(
2500-
Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos), IG->getFactor(),
2499+
auto *ReversePtr = new VPVectorEndPointerRecipe(
2500+
Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
2501+
-(int64_t)IG->getFactor(),
25012502
GEP && GEP->isInBounds() ? GEPNoWrapFlags::inBounds()
25022503
: GEPNoWrapFlags::none(),
25032504
InsertPos->getDebugLoc());

llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll

Lines changed: 8 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -367,10 +367,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
367367
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
368368
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
369369
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
370-
; CHECK-NEXT: [[TMP15:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
371-
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP15]], 1
372-
; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]]
373-
; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
370+
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 3
371+
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i64 2, [[TMP6]]
374372
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]]
375373
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP9]], align 4
376374
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
@@ -381,10 +379,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
381379
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
382380
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
383381
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
384-
; CHECK-NEXT: [[TMP21:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
385-
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP21]], 1
386-
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
387-
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
382+
; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP0]], 3
383+
; CHECK-NEXT: [[TMP18:%.*]] = sub nsw i64 2, [[TMP15]]
388384
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
389385
; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
390386
; CHECK-NEXT: [[REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP13]])
@@ -1579,10 +1575,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
15791575
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
15801576
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
15811577
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
1582-
; CHECK-NEXT: [[TMP6:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
1583-
; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 2
1584-
; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]]
1585-
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
1578+
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 4
1579+
; CHECK-NEXT: [[TMP9:%.*]] = sub nsw i64 4, [[TMP6]]
15861580
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]]
15871581
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 16 x i32>, ptr [[TMP10]], align 4
15881582
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> [[WIDE_VEC]])
@@ -1599,10 +1593,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A,
15991593
; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 4 x i32> [[REVERSE4]], [[VEC_IND]]
16001594
; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw <vscale x 4 x i32> [[REVERSE5]], [[VEC_IND]]
16011595
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
1602-
; CHECK-NEXT: [[TMP22:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32
1603-
; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 2
1604-
; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]]
1605-
; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64
1596+
; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP0]], 4
1597+
; CHECK-NEXT: [[TMP25:%.*]] = sub nsw i64 4, [[TMP22]]
16061598
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]]
16071599
; CHECK-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP17]])
16081600
; CHECK-NEXT: [[REVERSE7:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP18]])

0 commit comments

Comments (0)