Skip to content

Commit fff12fb

Browse files
authored
[VectorCombine] Fix the type used in foldShuffleOfIntrinsics Cost. (#138419)
The shuffle result needn't have twice the number of elements of the original vectors, so the intermediate type used between the shuffle and the intrinsic should use the number of elements of ShuffleDstTy. I found this when looking at shuffle costs and do not have a test where it alters the output, but I have added some cases where the shuffle output is not twice the size of the input.
1 parent b0979b8 commit fff12fb

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2376,7 +2376,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
23762376
} else {
23772377
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
23782378
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
2379-
VecTy->getNumElements() * 2));
2379+
ShuffleDstTy->getNumElements()));
23802380
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
23812381
VecTy, OldMask, CostKind);
23822382
}

llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,20 @@ entry:
6969
ret <8 x i1> %4
7070
}
7171

72+
define <2 x i1> @test4b(<4 x float> %0, <4 x float> %1) {
73+
; CHECK-LABEL: @test4b(
74+
; CHECK-NEXT: entry:
75+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <2 x i32> <i32 0, i32 4>
76+
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[TMP2]], i32 0)
77+
; CHECK-NEXT: ret <2 x i1> [[TMP3]]
78+
;
79+
entry:
80+
%2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
81+
%3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
82+
%4 = shufflevector <4 x i1> %2, <4 x i1> %3, <2 x i32> <i32 0, i32 4>
83+
ret <2 x i1> %4
84+
}
85+
7286
define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
7387
; CHECK-LABEL: @test5(
7488
; CHECK-NEXT: entry:
@@ -84,6 +98,26 @@ entry:
8498
ret <8 x float> %6
8599
}
86100

101+
define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4 x float> %a2, <4 x float> %b2, <4 x float> %c2) {
102+
; SSE-LABEL: @test6(
103+
; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], <2 x i32> <i32 0, i32 4>
104+
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B1:%.*]], <4 x float> [[B2:%.*]], <2 x i32> <i32 0, i32 4>
105+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[C1:%.*]], <4 x float> [[C2:%.*]], <2 x i32> <i32 0, i32 4>
106+
; SSE-NEXT: [[S:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
107+
; SSE-NEXT: ret <2 x float> [[S]]
108+
;
109+
; AVX-LABEL: @test6(
110+
; AVX-NEXT: [[F1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A1:%.*]], <4 x float> [[B1:%.*]], <4 x float> [[C1:%.*]])
111+
; AVX-NEXT: [[F2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A2:%.*]], <4 x float> [[B2:%.*]], <4 x float> [[C2:%.*]])
112+
; AVX-NEXT: [[S:%.*]] = shufflevector <4 x float> [[F1]], <4 x float> [[F2]], <2 x i32> <i32 0, i32 4>
113+
; AVX-NEXT: ret <2 x float> [[S]]
114+
;
115+
%f1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1)
116+
%f2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a2, <4 x float> %b2, <4 x float> %c2)
117+
%s = shufflevector <4 x float> %f1, <4 x float> %f2, <2 x i32> <i32 0, i32 4>
118+
ret <2 x float> %s
119+
}
120+
87121
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
88122
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
89123
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)

0 commit comments

Comments
 (0)