diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 380faa6cf6939..6005bca9e91e0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3712,7 +3712,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
 
 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
-    bool HasRealUse, const Instruction *I, Value *Scalar,
+    const Instruction *I, Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
   assert(Val->isVectorTy() && "This must be a vector type");
 
@@ -3732,12 +3732,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     }
 
     // The element at index zero is already inside the vector.
-    // - For a physical (HasRealUse==true) insert-element or extract-element
+    // - For an insert-element or extract-element
     // instruction that extracts integers, an explicit FPR -> GPR move is
     // needed. So it has non-zero cost.
-    // - For the rest of cases (virtual instruction or element type is float),
-    // consider the instruction free.
-    if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
+    if (Index == 0 && !Val->getScalarType()->isIntegerTy())
       return 0;
 
     // This is recognising a LD1 single-element structure to one lane of one
@@ -3887,25 +3885,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    unsigned Index,
                                                    const Value *Op0,
                                                    const Value *Op1) const {
-  bool HasRealUse =
-      Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
+  if (Index == 0 && Op0 && isa<PoisonValue>(Op0))
+    return 0;
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
-                                  Scalar, ScalarUserAndIdx);
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
+                                  ScalarUserAndIdx);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                    Type *Val,
                                                    TTI::TargetCostKind CostKind,
                                                    unsigned Index) const {
-  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
-                                  true /* HasRealUse */, &I);
+  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 9ada70bd7086a..5c502540377a6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
 
   // A helper function called by 'getVectorInstrCost'.
   //
-  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
-  // indicates whether the vector instruction is available in the input IR or
-  // just imaginary in vectorizer passes.
-  /// \param ScalarUserAndIdx encodes the information about extracts from a
+  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'.
+  /// \param ScalarUserAndIdx encodes the information about extracts from a
   /// vector with 'Scalar' being the value being extracted,'User' being the user
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCostHelper(
       unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
-      bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
+      const Instruction *I = nullptr, Value *Scalar = nullptr,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
 
 public:
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll
index 521264be8b31c..3432b040939e0 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll
@@ -12,11 +12,11 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V8i16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
index b484f8f6c60ba..21e0356fd7321 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
index 519b8ecf6dc76..27dd42297bfab 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
index 2a8609d2f418b..826605450a2d8 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index 41c272291d7ca..4579acb9b3555 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -93,36 +93,36 @@ define void @insert_subvec() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -369,7 +369,7 @@ define void @multipart() {
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
 ; CHECK-NEXT:  Cost Model: Found costs of 16 for: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found costs of 16 for: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
@@ -408,10 +408,10 @@ define void @vst3(ptr %p) {
 ; CHECK-LABEL: 'vst3'
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 ; CHECK-NEXT:  Cost Model: Found costs of 48 for: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found costs of 24 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 ; CHECK-NEXT:  Cost Model: Found costs of 48 for: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
@@ -452,10 +452,10 @@ define void @vst4(ptr %p) {
 ; CHECK-LABEL: 'vst4'
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of 64 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 32 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of 64 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index 09f116f01ec77..4a003a0085c23 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-LABEL: 'sel_v8i8'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0
 ;
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -14,7 +14,7 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 
 define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; CHECK-LABEL: 'sel_v16i8'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:30 Lat:60 SizeLat:60 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0
 ;
   %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -32,7 +32,7 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 
 define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: 'sel_v8i16'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0
 ;
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index ec84c58bf9681..fa889cc12dc4f 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -7,15 +7,15 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-VSCALE-1-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -44,7 +44,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -56,7 +56,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -101,7 +101,7 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -113,7 +113,7 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -1364,34 +1364,34 @@ define void @match() #3 {
 ; CHECK-VSCALE-1-LABEL: 'match'
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'match'
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'match'
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll
index aadaf1ffffb23..4fd40c898e709 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll
@@ -206,10 +206,26 @@ define <1 x i32> @test_dot_product_with_transposed_shuffle_op(<4 x i32> %a, <2 x
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i64 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> zeroinitializer, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul <2 x i32> [[SHUFFLE]], [[B:%.*]]
-; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP9]])
+; CHECK-NEXT:    [[SPLIT2:%.*]] = shufflevector <2 x i32> [[SHUFFLE]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <1 x i32> poison, i32 [[TMP10]], i64 0
-; CHECK-NEXT:    ret <1 x i32> [[TMP11]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <1 x i32> poison, i32 [[TMP20]], i64 0
+; CHECK-NEXT:    [[SPLIT3:%.*]] = shufflevector <2 x i32> [[B:%.*]], <2 x i32> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[BLOCK:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> poison, i32 [[TMP13]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP14:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]]
+; CHECK-NEXT:    [[BLOCK4:%.*]] = shufflevector <1 x i32> [[TMP12]], <1 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1
+; CHECK-NEXT:    [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> poison, i32 [[TMP15]], i64 0
+; CHECK-NEXT:    [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = mul <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add <1 x i32> [[TMP14]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = shufflevector <1 x i32> [[TMP17]], <1 x i32> poison, <1 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = shufflevector <1 x i32> poison, <1 x i32> [[TMP18]], <1 x i32> <i32 1>
+; CHECK-NEXT:    ret <1 x i32> [[TMP19]]
 ;
 entry:
   %t.a = tail call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> %a, i32 2, i32 2)
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 5e3fd156666f5..410696260a855 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -16,12 +16,11 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
index 2b5ee59aeb163..96dd691c4816e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
@@ -5,24 +5,21 @@ define i16 @foo(i16 %in1, i16 %in2) {
 ; CHECK-LABEL: define i16 @foo(
 ; CHECK-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535)
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533)
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64
+; CHECK-NEXT:    [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]]
+; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[TMP10]], 65535
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[AND1]], 65533
 ; CHECK-NEXT:    [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605
 ; CHECK-NEXT:    [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64
+; CHECK-NEXT:    [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]]
+; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[TMP13]], 65535
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533
 ; CHECK-NEXT:    [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605
 ; CHECK-NEXT:    [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16
 ; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]]
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
similarity index 89%
rename from llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll
rename to llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
index a1f4590a56919..04c69106d97ff 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define void @test() {
 ; CHECK-LABEL: define void @test() {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 667fc41c069e1..10a17f7e3f9a6 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -645,20 +645,21 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo)
 ; CHECK-NEXT:    [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64>
 ; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8>
 ; CHECK-NEXT:    [[A1:%.*]] = and <16 x i8> [[BC1]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i8> [[A1]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]]
 ; CHECK-NEXT:    [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64>
 ; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8>
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i8> [[BC2]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0
+; CHECK-NEXT:    [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8
 ; CHECK-NEXT:    [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]]
 ; CHECK-NEXT:    [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1)
 ; CHECK-NEXT:    store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 8, i32 24>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 0, i32 16>
-; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O3]], 0
+; CHECK-NEXT:    [[O2:%.*]] = or i8 [[E4]], [[E3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i8 [[O3]], [[O2]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O4]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %l = load <2 x i64>, ptr %values, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 3771ec4bda88b..fae0bde4f7e97 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -4,8 +4,20 @@
 define void @h() {
 ; CHECK-LABEL: define void @h() {
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV9:%.*]] = zext i16 0 to i32
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 0, i32 0>, i32 [[CONV9]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+; CHECK-NEXT:    store <4 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 poison, i32 0, i32 poison, i32 poison>, i32 [[CONV9]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP4]], <2 x i32> zeroinitializer, i64 2)
+; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16>
+; CHECK-NEXT:    store <4 x i16> [[TMP8]], ptr [[ARRAYIDX18]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index 89e133bb1c6a1..ff36a66ef9931 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; These tests check that we remove from consideration pairs of seed
@@ -26,7 +26,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '4'
+; YAML-NEXT:   - Cost:            '8'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -36,7 +36,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - Cost:            '10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -46,40 +46,48 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[X:%.*]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD16:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i64 0
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP6]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP16]]
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
+; CHECK-NEXT:    [[TMP23:%.*]] = sext i32 [[TMP22]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP23]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP24]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP18]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ADD11]], i64 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[T12]], i64 0
+; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP21]], i64 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
@@ -129,7 +137,7 @@ for.body:
 ; YAML:      Function:        getelementptr_2x32
 ; YAML:     Args:
 ; YAML:        - String:          'SLP vectorized with cost '
-; YAML:        - Cost:            '4'
+; YAML:        - Cost:            '10'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -139,35 +147,45 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[Z:%.*]], i64 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i64 0
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i64 1
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP12]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[T4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP11]], i64 0
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = sext i32 [[T4]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[G]], i64 [[TMP7]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP13]], i64 4
+; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP4]] to i64
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT:    [[T11:%.*]] = extractelement <2 x i32> [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[T11]] to i64
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP6]], i64 0)
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ADD11]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[T12]], i64 0
+; CHECK-NEXT:    [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP18]], i64 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index ac476c521a591..cab723823a78f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -14,7 +14,7 @@
 ; YAML-NEXT:  Function:        test_i16_extend
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-20'
+; YAML-NEXT:    - Cost:            '-16'
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '5'
 ; YAML-NEXT:  ...
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 9f5744b17cb79..929fb29a4a679 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -600,15 +600,27 @@ bb15:                                             ; preds = %bb15, %bb14
 define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
 ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store <2 x i32> <i32 10, i32 300>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01
+; CHECK-NEXT:    [[TMP4:%.*]] = fptosi float [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 10 to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 2, [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[TMP20]], align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index 07411cacb3626..8561a00490bfa 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,159 +14,387 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT:    [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2
+; CHECK-NEXT:    [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
+; CHECK-NEXT:    [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]]
+; CHECK-NEXT:    [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]]
+; CHECK-NEXT:    [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2
+; CHECK-NEXT:    [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
+; CHECK-NEXT:    [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]]
+; CHECK-NEXT:    [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]]
+; CHECK-NEXT:    [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2
+; CHECK-NEXT:    [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2
+; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]]
+; CHECK-NEXT:    [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2
+; CHECK-NEXT:    [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]]
+; CHECK-NEXT:    [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2
+; CHECK-NEXT:    [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]]
+; CHECK-NEXT:    [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2
+; CHECK-NEXT:    [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]]
+; CHECK-NEXT:    [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2
+; CHECK-NEXT:    [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]]
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2
+; CHECK-NEXT:    [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
+; CHECK-NEXT:    [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]]
+; CHECK-NEXT:    [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]]
+; CHECK-NEXT:    [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2
+; CHECK-NEXT:    [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
+; CHECK-NEXT:    [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]]
+; CHECK-NEXT:    [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]]
+; CHECK-NEXT:    [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2
+; CHECK-NEXT:    [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]]
+; CHECK-NEXT:    [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2
+; CHECK-NEXT:    [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2
+; CHECK-NEXT:    [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]]
+; CHECK-NEXT:    [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2
+; CHECK-NEXT:    [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]]
+; CHECK-NEXT:    [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2
+; CHECK-NEXT:    [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]]
+; CHECK-NEXT:    [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2
+; CHECK-NEXT:    [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]]
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2
+; CHECK-NEXT:    [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
+; CHECK-NEXT:    [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]]
+; CHECK-NEXT:    [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]]
+; CHECK-NEXT:    [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2
+; CHECK-NEXT:    [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
+; CHECK-NEXT:    [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]]
+; CHECK-NEXT:    [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]]
+; CHECK-NEXT:    [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2
+; CHECK-NEXT:    [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]]
+; CHECK-NEXT:    [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2
+; CHECK-NEXT:    [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]]
+; CHECK-NEXT:    [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2
+; CHECK-NEXT:    [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2
+; CHECK-NEXT:    [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]]
+; CHECK-NEXT:    [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2
+; CHECK-NEXT:    [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]]
+; CHECK-NEXT:    [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2
+; CHECK-NEXT:    [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]]
 ; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2
+; CHECK-NEXT:    [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
+; CHECK-NEXT:    [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]]
+; CHECK-NEXT:    [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]]
+; CHECK-NEXT:    [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2
+; CHECK-NEXT:    [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
+; CHECK-NEXT:    [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]]
+; CHECK-NEXT:    [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]]
+; CHECK-NEXT:    [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2
+; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2
+; CHECK-NEXT:    [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]]
+; CHECK-NEXT:    [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3
+; CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2
+; CHECK-NEXT:    [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]]
+; CHECK-NEXT:    [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4
+; CHECK-NEXT:    [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2
+; CHECK-NEXT:    [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]]
+; CHECK-NEXT:    [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5
+; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2
+; CHECK-NEXT:    [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6
+; CHECK-NEXT:    [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2
+; CHECK-NEXT:    [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]]
+; CHECK-NEXT:    [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7
+; CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2
+; CHECK-NEXT:    [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]]
 ; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2
+; CHECK-NEXT:    [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
+; CHECK-NEXT:    [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]]
+; CHECK-NEXT:    [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]]
+; CHECK-NEXT:    [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2
+; CHECK-NEXT:    [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
+; CHECK-NEXT:    [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]]
+; CHECK-NEXT:    [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]]
+; CHECK-NEXT:    [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2
+; CHECK-NEXT:    [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2
+; CHECK-NEXT:    [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]]
+; CHECK-NEXT:    [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2
+; CHECK-NEXT:    [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]]
+; CHECK-NEXT:    [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4
+; CHECK-NEXT:    [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2
+; CHECK-NEXT:    [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]]
+; CHECK-NEXT:    [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5
+; CHECK-NEXT:    [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2
+; CHECK-NEXT:    [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]]
+; CHECK-NEXT:    [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6
+; CHECK-NEXT:    [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2
+; CHECK-NEXT:    [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2
+; CHECK-NEXT:    [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]]
 ; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2
+; CHECK-NEXT:    [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
+; CHECK-NEXT:    [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]]
+; CHECK-NEXT:    [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]]
+; CHECK-NEXT:    [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2
+; CHECK-NEXT:    [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
+; CHECK-NEXT:    [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]]
+; CHECK-NEXT:    [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]]
+; CHECK-NEXT:    [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2
+; CHECK-NEXT:    [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2
+; CHECK-NEXT:    [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]]
+; CHECK-NEXT:    [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3
+; CHECK-NEXT:    [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2
+; CHECK-NEXT:    [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]]
+; CHECK-NEXT:    [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4
+; CHECK-NEXT:    [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2
+; CHECK-NEXT:    [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]]
+; CHECK-NEXT:    [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5
+; CHECK-NEXT:    [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2
+; CHECK-NEXT:    [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]]
+; CHECK-NEXT:    [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6
+; CHECK-NEXT:    [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2
+; CHECK-NEXT:    [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]]
+; CHECK-NEXT:    [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7
+; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2
+; CHECK-NEXT:    [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]]
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8)
-; CHECK-NEXT:    [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16)
-; CHECK-NEXT:    [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24)
-; CHECK-NEXT:    [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32)
-; CHECK-NEXT:    [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40)
-; CHECK-NEXT:    [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48)
-; CHECK-NEXT:    [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56)
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]]
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]]
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]]
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]]
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]]
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]]
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]]
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]]
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]]
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]]
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]]
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]]
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]]
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]]
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]]
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]]
-; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]]
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]]
-; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]]
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]]
-; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]]
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]]
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]]
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]]
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]]
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]]
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]]
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]]
-; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56
+; CHECK-NEXT:    [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2
+; CHECK-NEXT:    [[TMP74:%.*]] = zext i16 [[TMP56]] to i32
 ; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]]
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57
+; CHECK-NEXT:    [[MUL_766:%.*]] = mul nuw nsw i32 [[TMP74]], [[TMP74]]
+; CHECK-NEXT:    [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]]
+; CHECK-NEXT:    [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2
+; CHECK-NEXT:    [[TMP75:%.*]] = zext i16 [[TMP57]] to i32
 ; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58
+; CHECK-NEXT:    [[MUL_1_7:%.*]] = mul nuw nsw i32 [[TMP75]], [[TMP75]]
+; CHECK-NEXT:    [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]]
+; CHECK-NEXT:    [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2
+; CHECK-NEXT:    [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2
+; CHECK-NEXT:    [[TMP76:%.*]] = zext i16 [[TMP58]] to i32
 ; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59
+; CHECK-NEXT:    [[MUL_2_7:%.*]] = mul nuw nsw i32 [[TMP76]], [[TMP76]]
+; CHECK-NEXT:    [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]]
+; CHECK-NEXT:    [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3
+; CHECK-NEXT:    [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2
+; CHECK-NEXT:    [[TMP77:%.*]] = zext i16 [[TMP59]] to i32
 ; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60
+; CHECK-NEXT:    [[MUL_3_7:%.*]] = mul nuw nsw i32 [[TMP77]], [[TMP77]]
+; CHECK-NEXT:    [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]]
+; CHECK-NEXT:    [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4
+; CHECK-NEXT:    [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2
+; CHECK-NEXT:    [[TMP78:%.*]] = zext i16 [[TMP60]] to i32
 ; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]]
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61
+; CHECK-NEXT:    [[MUL_4_7:%.*]] = mul nuw nsw i32 [[TMP78]], [[TMP78]]
+; CHECK-NEXT:    [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]]
+; CHECK-NEXT:    [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5
+; CHECK-NEXT:    [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2
+; CHECK-NEXT:    [[TMP79:%.*]] = zext i16 [[TMP61]] to i32
 ; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62
+; CHECK-NEXT:    [[MUL_5_7:%.*]] = mul nuw nsw i32 [[TMP79]], [[TMP79]]
+; CHECK-NEXT:    [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]]
+; CHECK-NEXT:    [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6
+; CHECK-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2
+; CHECK-NEXT:    [[TMP80:%.*]] = zext i16 [[TMP62]] to i32
 ; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]]
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63
+; CHECK-NEXT:    [[MUL_6_7:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP80]]
+; CHECK-NEXT:    [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]]
+; CHECK-NEXT:    [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7
+; CHECK-NEXT:    [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2
+; CHECK-NEXT:    [[TMP81:%.*]] = zext i16 [[TMP63]] to i32
 ; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]]
-; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]])
+; CHECK-NEXT:    [[MUL_7_7:%.*]] = mul nuw nsw i32 [[TMP81]], [[TMP81]]
+; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]]
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
 ; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP82]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
@@ -573,13 +801,101 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT:    [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2
+; CHECK-NEXT:    [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]]
+; CHECK-NEXT:    [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]]
+; CHECK-NEXT:    [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]]
+; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2
+; CHECK-NEXT:    [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]]
+; CHECK-NEXT:    [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]]
+; CHECK-NEXT:    [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]]
+; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2
+; CHECK-NEXT:    [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]]
+; CHECK-NEXT:    [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]]
+; CHECK-NEXT:    [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]]
+; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2
+; CHECK-NEXT:    [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]]
+; CHECK-NEXT:    [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]]
+; CHECK-NEXT:    [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]]
+; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2
+; CHECK-NEXT:    [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]]
+; CHECK-NEXT:    [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]]
+; CHECK-NEXT:    [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]]
+; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2
+; CHECK-NEXT:    [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]]
+; CHECK-NEXT:    [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]]
+; CHECK-NEXT:    [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]]
+; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2
+; CHECK-NEXT:    [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]]
+; CHECK-NEXT:    [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]]
+; CHECK-NEXT:    [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]]
+; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2
+; CHECK-NEXT:    [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[ADD_14]], [[CONV_15]]
+; CHECK-NEXT:    [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]]
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[MUL_15]], [[ADD11_14]]
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[INC13]] = add nuw nsw i32 [[Y_038]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
similarity index 94%
rename from llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll
rename to llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
index 4478eab7b827a..15f4cffe77910 100644
--- a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
 ; CHECK-LABEL: define i32 @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
index afaf6b98e5081..094d60b66b393 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
@@ -90,14 +90,14 @@ entry:
 define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-LABEL: @splat_loads_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i64, ptr [[ARRAY2:%.*]], i64 1
-; CHECK-NEXT:    [[LD_2_0:%.*]] = load i64, ptr [[ARRAY2]], align 8
-; CHECK-NEXT:    [[LD_2_1:%.*]] = load i64, ptr [[GEP_2_1]], align 8
+; CHECK-NEXT:    [[GEP_2_2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY3:%.*]], i64 1
+; CHECK-NEXT:    [[LD_2_2:%.*]] = load i64, ptr [[ARRAY3]], align 8
+; CHECK-NEXT:    [[LD_2_3:%.*]] = load i64, ptr [[GEP_2_2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_2]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
@@ -131,14 +131,14 @@ entry:
 define void @splat_loads_i32(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-LABEL: @splat_loads_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY2:%.*]], i64 1
-; CHECK-NEXT:    [[LD_2_0:%.*]] = load i32, ptr [[ARRAY2]], align 8
-; CHECK-NEXT:    [[LD_2_1:%.*]] = load i32, ptr [[GEP_2_1]], align 8
+; CHECK-NEXT:    [[GEP_2_2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY3:%.*]], i64 1
+; CHECK-NEXT:    [[LD_2_2:%.*]] = load i32, ptr [[ARRAY3]], align 8
+; CHECK-NEXT:    [[LD_2_3:%.*]] = load i32, ptr [[GEP_2_2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_2]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i32> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
index aeb82d800a2f7..3c2f9e4d0ab5d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
@@ -4,17 +4,17 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1
+; CHECK-NEXT:    [[G_2197_REAL32_PRE:%.*]] = load i32, ptr null, align 1
+; CHECK-NEXT:    [[G_2197_IMAG33_PRE:%.*]] = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1
 ; CHECK-NEXT:    br label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN:.*]]:
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[G_2197_IMAG33_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[G_2197_REAL32_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr null, align 1
 ; CHECK-NEXT:    br label %[[TRAP:.*]]
 ; CHECK:       [[BB3:.*:]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr null, align 1
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[TRAP]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
index 3cb81b72d26a1..14ce08cb7aebe 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
@@ -6,19 +6,36 @@ define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
 ; CHECK-LABEL: define void @should_vectorize_gep
 ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2
+; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]]
+; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1
+; CHECK-NEXT:    [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2
+; CHECK-NEXT:    [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64
+; CHECK-NEXT:    [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2
+; CHECK-NEXT:    [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64
+; CHECK-NEXT:    [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]]
+; CHECK-NEXT:    [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2
+; CHECK-NEXT:    [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2
+; CHECK-NEXT:    [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2
+; CHECK-NEXT:    [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2
+; CHECK-NEXT:    [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64
+; CHECK-NEXT:    [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]]
+; CHECK-NEXT:    [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3
+; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3
+; CHECK-NEXT:    [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2
+; CHECK-NEXT:    [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64
+; CHECK-NEXT:    [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2
+; CHECK-NEXT:    [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64
+; CHECK-NEXT:    [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]]
 ; CHECK-NEXT:    call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index 00bd3eb232981..15518c9c57140 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -249,7 +249,7 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
 ; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1
 ; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
new file mode 100644
index 0000000000000..f90456297d7cb
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[PHI7]], i32 0
+; CHECK-NEXT:    switch i32 0, label [[BB16:%.*]] [
+; CHECK-NEXT:      i32 0, label [[BB14:%.*]]
+; CHECK-NEXT:      i32 1, label [[BB11:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb9:
+; CHECK-NEXT:    br label [[BB11]]
+; CHECK:       bb10:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb11:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x i32> [ poison, [[BB9:%.*]] ], [ [[TMP1]], [[BB1]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       bb14:
+; CHECK-NEXT:    ret void
+; CHECK:       bb15:
+; CHECK-NEXT:    ret void
+; CHECK:       bb16:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ], [ poison, [[BB25:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       bb25:
+; CHECK-NEXT:    switch i32 0, label [[BB16]] [
+; CHECK-NEXT:      i32 0, label [[BB14]]
+; CHECK-NEXT:      i32 1, label [[BB15:%.*]]
+; CHECK-NEXT:    ]
+;
+bb:
+  br label %bb1
+
+bb1:  ; preds = %bb10, %bb
+  %phi = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi2 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi3 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi4 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi5 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi6 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi7 = phi i32 [ 0, %bb10 ], [ 0, %bb ]  ; the only one of these eight phis with a second user (%phi12 in bb11)
+  %phi8 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  switch i32 0, label %bb16 [
+  i32 0, label %bb14
+  i32 1, label %bb11
+  ]
+
+bb9:  ; no predecessors
+  br label %bb11
+
+bb10:  ; no predecessors
+  br label %bb1
+
+bb11:  ; preds = %bb9, %bb1
+  %phi12 = phi i32 [ 0, %bb9 ], [ %phi7, %bb1 ]  ; reuses %phi7 outside the eight-phi group consumed in bb16
+  %phi13 = phi i32 [ 0, %bb9 ], [ undef, %bb1 ]
+  ret void
+
+bb14:  ; preds = %bb25, %bb1
+  ret void
+
+bb15:  ; preds = %bb25
+  ret void
+
+bb16:  ; preds = %bb25, %bb1
+  %phi17 = phi i32 [ %phi, %bb1 ], [ 0, %bb25 ]
+  %phi18 = phi i32 [ %phi2, %bb1 ], [ 0, %bb25 ]
+  %phi19 = phi i32 [ %phi3, %bb1 ], [ 0, %bb25 ]
+  %phi20 = phi i32 [ %phi4, %bb1 ], [ 0, %bb25 ]
+  %phi21 = phi i32 [ %phi5, %bb1 ], [ 0, %bb25 ]
+  %phi22 = phi i32 [ %phi6, %bb1 ], [ 0, %bb25 ]
+  %phi23 = phi i32 [ %phi7, %bb1 ], [ 0, %bb25 ]
+  %phi24 = phi i32 [ %phi8, %bb1 ], [ 0, %bb25 ]
+  ret void
+
+bb25:  ; no predecessors
+  switch i32 0, label %bb16 [
+  i32 0, label %bb14
+  i32 1, label %bb15
+  ]
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
similarity index 91%
rename from llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll
rename to llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
index 002b9a70255da..278e55c67f23f 100644
--- a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @test(ptr %sptr, i64 %0) {
 ; CHECK-LABEL: define i32 @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
new file mode 100644
index 0000000000000..0dac02b0bcc09
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], splat (i32 16)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+; CHECK-NEXT:    [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], splat (i32 16)
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16>
+; CHECK-NEXT:    br i1 true, label [[BB3]], label [[BB2]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i16> [ [[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
+; CHECK-NEXT:    store i32 [[TMP18]], ptr [[B]], align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP19]] to i32
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[C]], align 16
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = sext i16 [[TMP23]] to i32
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[B]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br i1 %tobool3.not, label %bb1, label %bb2
+
+bb1:  ; preds = %entry
+  %conv1.i.us = ashr i32 %0, 16
+  %cmp2.i.us = icmp slt i32 %conv1.i.us, %0
+  %sext26.us = zext i1 %cmp2.i.us to i32  ; zero-extended i1, so the value is 0 or 1
+  %conv1.i.us.5 = ashr i32 %0, 16
+  %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0
+  %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32
+  %conv1.i.us.6 = ashr i32 %0, 16
+  %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0
+  %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32
+  %conv1.i.us.7 = ashr i32 %0, 16
+  %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0
+  %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32
+  br label %bb3
+
+bb2:  ; preds = %bb2, %entry
+  %cmp2.i = icmp sgt i32 %0, 0
+  %1 = zext i1 %cmp2.i to i32
+  %cond.i = select i1 %tobool3.not, i32 %0, i32 %1
+  %sext26 = shl i32 %cond.i, 16  ; shl 16 then ashr 16 sign-extends the low 16 bits of %cond.i
+  %conv13 = ashr i32 %sext26, 16
+  %cmp2.i.5 = icmp sgt i32 %0, 0
+  %2 = zext i1 %cmp2.i.5 to i32
+  %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2
+  %sext26.5 = shl i32 %cond.i.5, 16
+  %conv13.5 = ashr i32 %sext26.5, 16
+  %cmp2.i.6 = icmp sgt i32 %0, 0
+  %3 = zext i1 %cmp2.i.6 to i32
+  %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3
+  %sext26.6 = shl i32 %cond.i.6, 16
+  %conv13.6 = ashr i32 %sext26.6, 16
+  %cmp2.i.7 = icmp sgt i32 %0, 0
+  %4 = zext i1 %cmp2.i.7 to i32
+  %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4
+  %sext26.7 = shl i32 %cond.i.7, 16
+  %conv13.7 = ashr i32 %sext26.7, 16
+  br i1 true, label %bb3, label %bb2  ; constant-true condition: the %bb2 back edge is never taken
+
+bb3:  ; preds = %bb2, %bb1
+  %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ]  ; both incoming values fit in 16 bits; the assertions above expect a <4 x i16> phi
+  %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ]
+  %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ]
+  %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ]
+  store i32 %conv13p, ptr %b, align 16
+  store i32 %conv13.5p, ptr %a, align 8
+  store i32 %conv13.6p, ptr %c, align 16
+  store i32 %conv13.7p, ptr %b, align 8
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index 9b6511d0d8284..d880c6b1783c8 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -17,12 +17,12 @@ define <2 x i32> @test(i32 %arg) {
 ; AARCH64-LABEL: define <2 x i32> @test(
 ; AARCH64-SAME: i32 [[ARG:%.*]]) {
 ; AARCH64-NEXT:  bb:
-; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ARG]], i32 0
-; AARCH64-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer
-; AARCH64-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; AARCH64-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP2:%.*]] = or i32 [[ARG]], 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = mul i32 0, 1
 ; AARCH64-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]]
 ; AARCH64-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; AARCH64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP3]], i32 1
 ; AARCH64-NEXT:    ret <2 x i32> [[TMP1]]
 ;
 bb:
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
index e6e5f5196d3da..5c035d29a7ea2 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) {
 ; Scalarizing the load for multiple constant indices may not be profitable.
 define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; because the vector large vector requires 2 vector registers.
 define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
-; CHECK-NEXT:    [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <8 x i32>, ptr %x, align 16