Skip to content

Commit a6c9b67

Browse files
committed
[AArch64,TTI] Remove RealUse check for vector insert/extract costs.
getVectorInstrCostHelper would return costs of zero for vector inserts/extracts that move data between GPR and vector registers, if there was no 'real' use, i.e. there was no corresponding existing instruction. This meant that passes like LoopVectorize and SLPVectorizer, which likely are the main users of the interface, would underestimate the cost of inserts/extracts that move data between GPR and vector registers, which have non-trivial costs. The patch removes the special case and only returns costs of zero for lane 0 if there is no need to transfer between integer and vector registers. This impacts a number of SLP tests, and most of them look like improvements to me. I am seeing +2% end-to-end improvements on SLP-heavy workloads.
1 parent 6d67794 commit a6c9b67

29 files changed

+898
-325
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3712,7 +3712,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
37123712

37133713
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37143714
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3715-
bool HasRealUse, const Instruction *I, Value *Scalar,
3715+
const Instruction *I, Value *Scalar,
37163716
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
37173717
assert(Val->isVectorTy() && "This must be a vector type");
37183718

@@ -3732,12 +3732,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37323732
}
37333733

37343734
// The element at index zero is already inside the vector.
3735-
// - For a physical (HasRealUse==true) insert-element or extract-element
3735+
// - For an insert-element or extract-element
37363736
// instruction that extracts integers, an explicit FPR -> GPR move is
37373737
// needed. So it has non-zero cost.
3738-
// - For the rest of cases (virtual instruction or element type is float),
3739-
// consider the instruction free.
3740-
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3738+
if (Index == 0 && !Val->getScalarType()->isIntegerTy())
37413739
return 0;
37423740

37433741
// This is recognising a LD1 single-element structure to one lane of one
@@ -3887,25 +3885,22 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
38873885
unsigned Index,
38883886
const Value *Op0,
38893887
const Value *Op1) const {
3890-
bool HasRealUse =
3891-
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3892-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
3888+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
38933889
}
38943890

38953891
InstructionCost AArch64TTIImpl::getVectorInstrCost(
38963892
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
38973893
Value *Scalar,
38983894
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3899-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3900-
Scalar, ScalarUserAndIdx);
3895+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
3896+
ScalarUserAndIdx);
39013897
}
39023898

39033899
InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
39043900
Type *Val,
39053901
TTI::TargetCostKind CostKind,
39063902
unsigned Index) const {
3907-
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
3908-
true /* HasRealUse */, &I);
3903+
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
39093904
}
39103905

39113906
InstructionCost AArch64TTIImpl::getScalarizationOverhead(

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
6565

6666
// A helper function called by 'getVectorInstrCost'.
6767
//
68-
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
69-
// indicates whether the vector instruction is available in the input IR or
70-
// just imaginary in vectorizer passes.
71-
/// \param ScalarUserAndIdx encodes the information about extracts from a
68+
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost';
69+
/// \param ScalarUserAndIdx encodes the information about extracts from a
7270
/// vector with 'Scalar' being the value being extracted,'User' being the user
7371
/// of the extract(nullptr if user is not known before vectorization) and
7472
/// 'Idx' being the extract lane.
7573
InstructionCost getVectorInstrCostHelper(
7674
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
77-
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
75+
const Instruction *I = nullptr, Value *Scalar = nullptr,
7876
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
7977

8078
public:

llvm/test/Analysis/CostModel/AArch64/reduce-add.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ define void @reduce() {
1212
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
15-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
15+
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
1616
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
1717
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
19-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
19+
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
2121
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
2222
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-and.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-or.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)

0 commit comments

Comments
 (0)