Skip to content

Commit 6ed7723

Browse files
committed
[AArch64,TTI] Remove RealUse check for vector insert/extract costs.
getVectorInstrCostHelper would return costs of zero for vector inserts/extracts that move data between GPR and vector registers, if there was no 'real' use, i.e. there was no corresponding existing instruction. This meant that passes like LoopVectorize and SLPVectorizer, which likely are the main users of the interface, would underestimate the cost of inserts/extracts that move data between GPR and vector registers, which has non-trivial costs. The patch removes the special case and only returns costs of zero for lane 0 if there is no need to transfer between integer and vector registers. This impacts a number of SLP tests, and most of them look like improvements to me. I am seeing +2% end-to-end improvements on SLP-heavy workloads.
1 parent 6731f15 commit 6ed7723

29 files changed

+890
-319
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3712,7 +3712,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
37123712

37133713
InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37143714
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
3715-
bool HasRealUse, const Instruction *I, Value *Scalar,
3715+
const Instruction *I, Value *Scalar,
37163716
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
37173717
assert(Val->isVectorTy() && "This must be a vector type");
37183718

@@ -3732,14 +3732,13 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
37323732
}
37333733

37343734
// The element at index zero is already inside the vector.
3735-
// - For a physical (HasRealUse==true) insert-element or extract-element
3735+
// - For an insert-element or extract-element
37363736
// instruction that extracts integers, an explicit FPR -> GPR move is
37373737
// needed. So it has non-zero cost.
3738-
// - For the rest of cases (virtual instruction or element type is float),
3739-
// consider the instruction free.
3740-
if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
3738+
if (Index == 0 && !Val->getScalarType()->isIntegerTy())
37413739
return 0;
37423740

3741+
37433742
// This is recognising a LD1 single-element structure to one lane of one
37443743
// register instruction. I.e., if this is an `insertelement` instruction,
37453744
// and its second operand is a load, then we will generate a LD1, which
@@ -3887,16 +3886,14 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
38873886
unsigned Index,
38883887
const Value *Op0,
38893888
const Value *Op1) const {
3890-
bool HasRealUse =
3891-
Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
3892-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
3889+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
38933890
}
38943891

38953892
InstructionCost AArch64TTIImpl::getVectorInstrCost(
38963893
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
38973894
Value *Scalar,
38983895
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
3899-
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
3896+
return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
39003897
Scalar, ScalarUserAndIdx);
39013898
}
39023899

@@ -3905,7 +3902,7 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
39053902
TTI::TargetCostKind CostKind,
39063903
unsigned Index) const {
39073904
return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
3908-
true /* HasRealUse */, &I);
3905+
&I);
39093906
}
39103907

39113908
InstructionCost AArch64TTIImpl::getScalarizationOverhead(

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
6565

6666
// A helper function called by 'getVectorInstrCost'.
6767
//
68-
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
69-
// indicates whether the vector instruction is available in the input IR or
70-
// just imaginary in vectorizer passes.
71-
/// \param ScalarUserAndIdx encodes the information about extracts from a
68+
// 'Val' and 'Index' are forwarded from 'getVectorInstrCost';
69+
// \param ScalarUserAndIdx encodes the information about extracts from a
7270
/// vector with 'Scalar' being the value being extracted,'User' being the user
7371
/// of the extract(nullptr if user is not known before vectorization) and
7472
/// 'Idx' being the extract lane.
7573
InstructionCost getVectorInstrCostHelper(
7674
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
77-
bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
75+
const Instruction *I = nullptr, Value *Scalar = nullptr,
7876
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
7977

8078
public:

llvm/test/Analysis/CostModel/AArch64/reduce-add.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ define void @reduce() {
1212
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
15-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
15+
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
1616
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
1717
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
19-
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
19+
; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
2121
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
2222
; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-and.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-or.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)

llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @reduce() {
1313
; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
1414
; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
1515
; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
16-
; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
17-
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
16+
; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
17+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
1818
; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
1919
; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
2020
; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)

0 commit comments

Comments
 (0)