diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index e9bc6d947b0d9..f7beca1b8b77e 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6750,6 +6750,21 @@ inline std::optional isDUPQMask(ArrayRef Mask, unsigned Segments,
   return std::nullopt;
 }
 
+/// Returns true if \p Mask repeats the first segment across all \p Segments:
+/// each lane I is either undef/poison (<0) or selects I % \p SegmentSize.
+inline bool isDUPFirstSegmentMask(ArrayRef Mask, unsigned Segments,
+                                  unsigned SegmentSize) {
+  // A mask whose length differs from the segmented layout is rejected.
+  if (Mask.size() != Segments * SegmentSize)
+    return false;
+
+  // Negative (undef/poison) lanes place no constraint on the match.
+  for (unsigned Lane = 0, E = Mask.size(); Lane != E; ++Lane)
+    if (Mask[Lane] >= 0 && unsigned(Mask[Lane]) != Lane % SegmentSize)
+      return false;
+  return true;
+}
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 3387dee8aa4c8..22074f32a573f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5600,9 +5600,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   }
 
   // Segmented shuffle matching.
- if ((ST->hasSVE2p1() || ST->hasSME2p1()) && - ST->isSVEorStreamingSVEAvailable() && Kind == TTI::SK_PermuteSingleSrc && - isa(SrcTy) && !Mask.empty() && + if (Kind == TTI::SK_PermuteSingleSrc && isa(SrcTy) && + !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf( AArch64::SVEBitsPerBlock)) { @@ -5612,7 +5611,13 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, unsigned SegmentElts = VTy->getNumElements() / Segments; // dupq zd.t, zn.t[idx] - if (isDUPQMask(Mask, Segments, SegmentElts)) + if ((ST->hasSVE2p1() || ST->hasSME2p1()) && + ST->isSVEorStreamingSVEAvailable() && + isDUPQMask(Mask, Segments, SegmentElts)) + return LT.first; + + // mov zd.q, vn + if (isDUPFirstSegmentMask(Mask, Segments, SegmentElts)) return LT.first; } diff --git a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll index 790f49f1d3b82..8b94cefbad63b 100644 --- a/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll +++ b/llvm/test/Analysis/CostModel/AArch64/segmented-shufflevector-patterns.ll @@ -49,5 +49,53 @@ define void @dup_within_each_segment_512b() #1 { ret void } +define void @dup_whole_segment_256b() #0 { +; CHECK-LABEL: 'dup_whole_segment_256b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_512b_d = shufflevector <8 
x double> poison, <8 x double> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> + %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> + %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> + ret void +} + +define void @dup_whole_segment_512b() #1 { +; CHECK-LABEL: 'dup_whole_segment_512b' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_h = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %dup_seg_b = shufflevector <32 x i8> poison, <32 x i8> poison, <32 x i32> + %dup_seg_h = shufflevector <16 x i16> poison, <16 x 
i16> poison, <16 x i32> + %dup_seg_s = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> + %dup_seg_d = shufflevector <4 x i64> poison, <4 x i64> poison, <4 x i32> + %dup_seg_512b_d = shufflevector <8 x double> poison, <8 x double> poison, <8 x i32> + %dup_seg_s_with_poison = shufflevector <8 x float> poison, <8 x float> poison, <8 x i32> + ret void +} + attributes #0 = { noinline vscale_range(2,2) } attributes #1 = { noinline vscale_range(4,4) }