diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 380faa6cf6939..6005bca9e91e0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3712,7 +3712,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, - bool HasRealUse, const Instruction *I, Value *Scalar, + const Instruction *I, Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { assert(Val->isVectorTy() && "This must be a vector type"); @@ -3732,12 +3732,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( } // The element at index zero is already inside the vector. - // - For a physical (HasRealUse==true) insert-element or extract-element + // - For an insert-element or extract-element // instruction that extracts integers, an explicit FPR -> GPR move is // needed. So it has non-zero cost. - // - For the rest of cases (virtual instruction or element type is float), - // consider the instruction free. 
- if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) + if (Index == 0 && !Val->getScalarType()->isIntegerTy()) return 0; // This is recognising a LD1 single-element structure to one lane of one @@ -3887,25 +3885,24 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index, const Value *Op0, const Value *Op1) const { - bool HasRealUse = - Opcode == Instruction::InsertElement && Op0 && !isa(Op0); - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); + if (Index == 0 && Op0 && isa(Op0)) + return 0; + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index); } InstructionCost AArch64TTIImpl::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef> ScalarUserAndIdx) const { - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr, - Scalar, ScalarUserAndIdx); + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar, + ScalarUserAndIdx); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const { - return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, - true /* HasRealUse */, &I); + return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I); } InstructionCost AArch64TTIImpl::getScalarizationOverhead( diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 9ada70bd7086a..5c502540377a6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase { // A helper function called by 'getVectorInstrCost'. 
// - // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse' - // indicates whether the vector instruction is available in the input IR or - // just imaginary in vectorizer passes. - /// \param ScalarUserAndIdx encodes the information about extracts from a + // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'. + /// \param ScalarUserAndIdx encodes the information about extracts from a /// vector with 'Scalar' being the value being extracted,'User' being the user /// of the extract(nullptr if user is not known before vectorization) and /// 'Idx' being the extract lane. InstructionCost getVectorInstrCostHelper( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, - bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr, + const Instruction *I = nullptr, Value *Scalar = nullptr, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const; public: diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll index 521264be8b31c..3432b040939e0 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-add.ll @@ -12,11 +12,11 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V16i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32i8 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64i8 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V8i16 = call i16 
@llvm.vector.reduce.add.v8i16(<8 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V16i16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:3 Lat:4 SizeLat:4 for: %V2i32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V4i32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V8i32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 2 for: %V2i64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll index b484f8f6c60ba..21e0356fd7321 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll @@ -13,8 +13,8 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 
@llvm.vector.reduce.and.v4i8(<4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll index 519b8ecf6dc76..27dd42297bfab 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll @@ -13,8 +13,8 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll index 2a8609d2f418b..826605450a2d8 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll +++ 
b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll @@ -13,8 +13,8 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll index 41c272291d7ca..4579acb9b3555 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -93,36 +93,36 @@ define void @insert_subvec() { ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 
CodeSize:3 Lat:6 SizeLat:6 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: 
%v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_0 = shufflevector <16 x i32> 
undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> @@ -369,7 +369,7 @@ define void @multipart() { ; CHECK-NEXT: Cost Model: Found costs of 4 for: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 for: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 16 for: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found costs of 1 for: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> @@ -408,10 +408,10 @@ define void @vst3(ptr 
%p) { ; CHECK-LABEL: 'vst3' ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found costs of 48 for: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> ; CHECK-NEXT: Cost Model: Found costs of 24 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> ; CHECK-NEXT: Cost Model: Found costs of 48 for: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> ; CHECK-NEXT: Cost Model: Found costs of 5 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> @@ -452,10 +452,10 @@ define void @vst4(ptr %p) { ; CHECK-LABEL: 'vst4' ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 
SizeLat:128 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found costs of 64 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found costs of 8 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of 32 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> ; CHECK-NEXT: Cost Model: Found costs of 64 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> ; CHECK-NEXT: Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll index 09f116f01ec77..4a003a0085c23 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll @@ -5,7 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'sel_v8i8' -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0 ; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> @@ -14,7 +14,7 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <16 x i8> 
@sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'sel_v16i8' -; CHECK-NEXT: Cost Model: Found costs of RThru:60 CodeSize:30 Lat:60 SizeLat:60 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0 ; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> @@ -32,7 +32,7 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'sel_v8i16' -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0 ; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll index ec84c58bf9681..fa889cc12dc4f 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -7,15 +7,15 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define void @vector_insert_extract( %v0, %v1, <16 x i32> %v2) { ; CHECK-VSCALE-1-LABEL: 'vector_insert_extract' -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call 
@llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call @llvm.vector.extract.nxv4i32.nxv16i32( %v1, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %v0, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'vector_insert_extract' -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( %v0, i64 0) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call @llvm.vector.insert.nxv4i32.v16i32( %v0, <16 x i32> %v2, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call @llvm.vector.extract.nxv4i32.nxv16i32( %v1, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %v0, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: 
Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void @@ -44,7 +44,7 @@ define void @vector_insert_extract_idxzero_128b() #1 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -56,7 +56,7 @@ define void @vector_insert_extract_idxzero_128b() #1 { ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; 
CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -101,7 +101,7 @@ define void @vector_insert_extract_idxzero_256b() #2 { ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:12 
CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -113,7 +113,7 @@ define void @vector_insert_extract_idxzero_256b() #2 { ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call @llvm.vector.insert.nxv16i1.nxv2i1( undef, undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call @llvm.vector.extract.nxv4i1.nxv16i1( undef, i64 0) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call @llvm.vector.insert.nxv2f32.v2f32( undef, <2 x float> undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16( undef, i64 0) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call @llvm.vector.insert.nxv4f32.nxv2f32( undef, undef, i64 0) @@ -1364,34 +1364,34 @@ define void @match() #3 { ; CHECK-VSCALE-1-LABEL: 'match' ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call @llvm.experimental.vector.match.nxv16i8.v16i8( undef, <16 x i8> undef, undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call 
@llvm.experimental.vector.match.nxv8i16.v8i16( undef, <8 x i16> undef, undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) -; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) +; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 
Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) ; CHECK-VSCALE-1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; CHECK-VSCALE-2-LABEL: 'match' ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call @llvm.experimental.vector.match.nxv16i8.v16i8( undef, <16 x i8> undef, undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call @llvm.experimental.vector.match.nxv8i16.v8i16( undef, <8 x i16> undef, undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, 
<4 x i1> undef) -; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) +; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) ; CHECK-VSCALE-2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; ; TYPE_BASED_ONLY-LABEL: 'match' ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call @llvm.experimental.vector.match.nxv16i8.v16i8( undef, <16 x i8> undef, undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call @llvm.experimental.vector.match.nxv8i16.v8i16( undef, <8 x i16> undef, undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call @llvm.experimental.vector.match.nxv4i32.v4i32( undef, <4 x i32> undef, undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call @llvm.experimental.vector.match.nxv2i64.v2i64( undef, <2 x i64> undef, undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found 
costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll index aadaf1ffffb23..4fd40c898e709 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-transpose-int.ll @@ -206,10 +206,26 @@ define <1 x i32> @test_dot_product_with_transposed_shuffle_op(<4 x i32> %a, <2 x ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP7]], <4 x i32> ; CHECK-NEXT: 
[[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> zeroinitializer, <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[SHUFFLE]], [[B:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP9]]) +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <2 x i32> [[SHUFFLE]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <1 x i32> poison, i32 [[TMP10]], i64 0 -; CHECK-NEXT: ret <1 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <1 x i32> poison, i32 [[TMP20]], i64 0 +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <2 x i32> [[B:%.*]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x i32> [[TMP11]], <1 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x i32> poison, i32 [[TMP13]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT]], <1 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = mul <1 x i32> [[BLOCK]], [[SPLAT_SPLAT]] +; CHECK-NEXT: [[BLOCK4:%.*]] = shufflevector <1 x i32> [[TMP12]], <1 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[SPLAT_SPLATINSERT5:%.*]] = insertelement <1 x i32> poison, i32 [[TMP15]], i64 0 +; CHECK-NEXT: [[SPLAT_SPLAT6:%.*]] = shufflevector <1 x i32> [[SPLAT_SPLATINSERT5]], <1 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = mul <1 x i32> [[BLOCK4]], [[SPLAT_SPLAT6]] +; CHECK-NEXT: [[TMP17:%.*]] = add <1 x i32> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <1 x i32> [[TMP17]], <1 x i32> poison, <1 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <1 x i32> poison, <1 
x i32> [[TMP18]], <1 x i32> +; CHECK-NEXT: ret <1 x i32> [[TMP19]] ; entry: %t.a = tail call <4 x i32> @llvm.matrix.transpose.v4i32(<4 x i32> %a, i32 2, i32 2) diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll index 5e3fd156666f5..410696260a855 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -16,12 +16,11 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) { ; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64 ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll index 2b5ee59aeb163..96dd691c4816e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll @@ -5,24 +5,21 @@ define i16 @foo(i16 %in1, i16 %in2) { ; CHECK-LABEL: define i16 @foo( ; CHECK-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: 
[[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535) -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64 +; CHECK-NEXT: [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]] +; CHECK-NEXT: [[AND1:%.*]] = and i64 [[TMP10]], 65535 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[AND1]], 65533 ; CHECK-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 ; CHECK-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605 ; CHECK-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16 ; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64 +; CHECK-NEXT: [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]] +; CHECK-NEXT: [[AND2:%.*]] = and i64 [[TMP13]], 65535 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533 ; CHECK-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 ; CHECK-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605 ; CHECK-NEXT: [[ZEXT4_2:%.*]] = 
zext i1 [[CMP2_2]] to i16 ; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]] diff --git a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll rename to llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll index a1f4590a56919..04c69106d97ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s define void @test() { ; CHECK-LABEL: define void @test() { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll index 667fc41c069e1..10a17f7e3f9a6 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll @@ -645,20 +645,21 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo) ; CHECK-NEXT: [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64> ; CHECK-NEXT: [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8> ; CHECK-NEXT: [[A1:%.*]] = and <16 x i8> [[BC1]], +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[A1]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8 ; 
CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]] ; CHECK-NEXT: [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64> ; CHECK-NEXT: [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8> ; CHECK-NEXT: [[A2:%.*]] = and <16 x i8> [[BC2]], +; CHECK-NEXT: [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0 +; CHECK-NEXT: [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8 ; CHECK-NEXT: [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]] ; CHECK-NEXT: [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1) ; CHECK-NEXT: store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1 ; CHECK-NEXT: [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O3]], 0 +; CHECK-NEXT: [[O2:%.*]] = or i8 [[E4]], [[E3]] +; CHECK-NEXT: [[O4:%.*]] = or i8 [[O3]], [[O2]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[O4]], 0 ; CHECK-NEXT: ret i1 [[C]] ; %l = load <2 x i64>, ptr %values, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll index 3771ec4bda88b..fae0bde4f7e97 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll @@ -4,8 +4,20 @@ define void @h() { ; CHECK-LABEL: define void @h() { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV9:%.*]] = zext i16 0 to i32 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16 -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr [[ARRAYIDX2]], align 2 +; 
CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[CONV9]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16> +; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[ARRAYIDX2]], align 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[CONV9]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP4]], <2 x i32> zeroinitializer, i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i32> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = or <4 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i16> +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr [[ARRAYIDX18]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index 89e133bb1c6a1..ff36a66ef9931 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s -; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s +; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s ; These tests check 
that we remove from consideration pairs of seed @@ -26,7 +26,7 @@ ; YAML-NEXT: Function: getelementptr_4x32 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '4' +; YAML-NEXT: - Cost: '8' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' @@ -36,7 +36,7 @@ ; YAML-NEXT: Function: getelementptr_4x32 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '6' +; YAML-NEXT: - Cost: '10' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' @@ -46,40 +46,48 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[X:%.*]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[X:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i64 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[ADD16:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i64 0 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_032:%.*]] 
= phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP7]], i64 1 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP15]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP17]] ; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]] +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP16]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: 
[[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP23]] ; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP18]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> , i32 [[ADD11]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[T12]], i64 0 +; CHECK-NEXT: [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP21]], i64 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; @@ -129,7 +137,7 @@ for.body: ; YAML: Function: getelementptr_2x32 ; YAML: Args: ; YAML: - String: 'SLP vectorized with cost ' -; YAML: - Cost: '4' +; YAML: - Cost: '10' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' @@ -139,35 +147,45 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.body.preheader: 
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[Y:%.*]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> [[TMP10]], i32 [[Z:%.*]], i64 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i64 0 ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ] -; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i64 1 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP12]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP11]], i64 0 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[T4]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[G]], i64 [[TMP7]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[TMP13]], i64 4 +; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] 
= add nsw i32 [[ADD1]], [[T8]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i64 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP4]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP8]] ; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4 -; CHECK-NEXT: [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]] -; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]] +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]] +; CHECK-NEXT: [[T11:%.*]] = extractelement <2 x i32> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP8]], <2 x i32> [[TMP6]], i64 0) -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 
x i32> , i32 [[ADD11]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[T12]], i64 0 +; CHECK-NEXT: [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP18]], i64 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll index ac476c521a591..cab723823a78f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll @@ -14,7 +14,7 @@ ; YAML-NEXT: Function: test_i16_extend ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-20' +; YAML-NEXT: - Cost: '-16' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '5' ; YAML-NEXT: ... 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll index 9f5744b17cb79..929fb29a4a679 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -600,15 +600,27 @@ bb15: ; preds = %bb15, %bb14 define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) { ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks( ; CHECK-NEXT: entry: -; CHECK-NEXT: store <2 x i32> , ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01 +; CHECK-NEXT: [[TMP4:%.*]] = fptosi float [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300 +; CHECK-NEXT: [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]] +; CHECK-NEXT: store i32 [[TMP7]], ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1 +; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]] ; CHECK: bb14: -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 10 to i64 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 2, [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds 
[[STRUCT:%.*]], ptr [[A]], i64 0, i32 2 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2 ; CHECK-NEXT: store float 0.000000e+00, ptr [[TMP20]], align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index 07411cacb3626..8561a00490bfa 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,159 +14,387 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]] +; CHECK-NEXT: [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]] +; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]] +; CHECK-NEXT: 
[[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]] +; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]] +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 +; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]] +; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]] +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]] +; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]] +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]] +; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]] +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32 +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]] +; CHECK-NEXT: [[ADD11_7:%.*]] = 
add i32 [[MUL_7]], [[ADD11_6]] +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] +; CHECK-NEXT: [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]] +; CHECK-NEXT: [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]] +; CHECK-NEXT: [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2 +; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] +; CHECK-NEXT: [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]] +; CHECK-NEXT: [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]] +; CHECK-NEXT: [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2 +; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] +; CHECK-NEXT: [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]] +; CHECK-NEXT: [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]] +; CHECK-NEXT: [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2 +; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] +; CHECK-NEXT: [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]] +; CHECK-NEXT: [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]] +; CHECK-NEXT: [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2 +; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to 
i32 +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] +; CHECK-NEXT: [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]] +; CHECK-NEXT: [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]] +; CHECK-NEXT: [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2 +; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] +; CHECK-NEXT: [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]] +; CHECK-NEXT: [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]] +; CHECK-NEXT: [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2 +; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] +; CHECK-NEXT: [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]] +; CHECK-NEXT: [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]] +; CHECK-NEXT: [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2 +; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] +; CHECK-NEXT: [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]] +; CHECK-NEXT: [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]] ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] +; CHECK-NEXT: [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]] +; CHECK-NEXT: [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]] +; 
CHECK-NEXT: [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2 +; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] +; CHECK-NEXT: [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]] +; CHECK-NEXT: [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]] +; CHECK-NEXT: [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2 +; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] +; CHECK-NEXT: [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]] +; CHECK-NEXT: [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]] +; CHECK-NEXT: [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3 +; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2 +; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32 +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] +; CHECK-NEXT: [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]] +; CHECK-NEXT: [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]] +; CHECK-NEXT: [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2 +; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] +; CHECK-NEXT: [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]] +; CHECK-NEXT: [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]] +; CHECK-NEXT: [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5 +; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2 +; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32 +; 
CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] +; CHECK-NEXT: [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]] +; CHECK-NEXT: [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]] +; CHECK-NEXT: [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2 +; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32 +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] +; CHECK-NEXT: [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]] +; CHECK-NEXT: [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]] +; CHECK-NEXT: [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7 +; CHECK-NEXT: [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2 +; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32 +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] +; CHECK-NEXT: [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]] +; CHECK-NEXT: [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]] ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] +; CHECK-NEXT: [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]] +; CHECK-NEXT: [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]] +; CHECK-NEXT: [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2 +; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] +; CHECK-NEXT: [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]] +; CHECK-NEXT: [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]] +; 
CHECK-NEXT: [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2 +; CHECK-NEXT: [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2 +; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] +; CHECK-NEXT: [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]] +; CHECK-NEXT: [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]] +; CHECK-NEXT: [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2 +; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] +; CHECK-NEXT: [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]] +; CHECK-NEXT: [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]] +; CHECK-NEXT: [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2 +; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] +; CHECK-NEXT: [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]] +; CHECK-NEXT: [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]] +; CHECK-NEXT: [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5 +; CHECK-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2 +; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] +; CHECK-NEXT: [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]] +; CHECK-NEXT: [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]] +; CHECK-NEXT: [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6 +; CHECK-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2 +; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32 +; 
CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] +; CHECK-NEXT: [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]] +; CHECK-NEXT: [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]] +; CHECK-NEXT: [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7 +; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2 +; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] +; CHECK-NEXT: [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]] +; CHECK-NEXT: [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]] ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] +; CHECK-NEXT: [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]] +; CHECK-NEXT: [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]] +; CHECK-NEXT: [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2 +; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] +; CHECK-NEXT: [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]] +; CHECK-NEXT: [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]] +; CHECK-NEXT: [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2 +; CHECK-NEXT: [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2 +; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32 +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] +; CHECK-NEXT: [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]] +; CHECK-NEXT: [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]] +; 
CHECK-NEXT: [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3 +; CHECK-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2 +; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] +; CHECK-NEXT: [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]] +; CHECK-NEXT: [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]] +; CHECK-NEXT: [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4 +; CHECK-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2 +; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] +; CHECK-NEXT: [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]] +; CHECK-NEXT: [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]] +; CHECK-NEXT: [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5 +; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2 +; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] +; CHECK-NEXT: [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]] +; CHECK-NEXT: [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]] +; CHECK-NEXT: [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6 +; CHECK-NEXT: [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2 +; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] +; CHECK-NEXT: [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]] +; CHECK-NEXT: [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]] +; CHECK-NEXT: [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7 +; CHECK-NEXT: [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2 +; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32 +; 
CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] +; CHECK-NEXT: [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]] +; CHECK-NEXT: [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]] ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] +; CHECK-NEXT: [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]] +; CHECK-NEXT: [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]] +; CHECK-NEXT: [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1 +; CHECK-NEXT: [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2 +; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] +; CHECK-NEXT: [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]] +; CHECK-NEXT: [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]] +; CHECK-NEXT: [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2 +; CHECK-NEXT: [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2 +; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] +; CHECK-NEXT: [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]] +; CHECK-NEXT: [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]] +; CHECK-NEXT: [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3 +; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2 +; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32 +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] +; CHECK-NEXT: [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]] +; CHECK-NEXT: [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]] +; 
CHECK-NEXT: [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4 +; CHECK-NEXT: [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2 +; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32 +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] +; CHECK-NEXT: [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]] +; CHECK-NEXT: [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]] +; CHECK-NEXT: [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5 +; CHECK-NEXT: [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2 +; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32 +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] +; CHECK-NEXT: [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]] +; CHECK-NEXT: [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]] +; CHECK-NEXT: [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6 +; CHECK-NEXT: [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2 +; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32 +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] +; CHECK-NEXT: [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]] +; CHECK-NEXT: [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]] +; CHECK-NEXT: [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7 +; CHECK-NEXT: [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2 +; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] +; CHECK-NEXT: [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]] +; CHECK-NEXT: [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]] ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2 +; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32 +; 
CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] +; CHECK-NEXT: [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]] +; CHECK-NEXT: [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]] +; CHECK-NEXT: [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1 +; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2 +; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32 +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] +; CHECK-NEXT: [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]] +; CHECK-NEXT: [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]] +; CHECK-NEXT: [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2 +; CHECK-NEXT: [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2 +; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32 +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] +; CHECK-NEXT: [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]] +; CHECK-NEXT: [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]] +; CHECK-NEXT: [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3 +; CHECK-NEXT: [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2 +; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] +; CHECK-NEXT: [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]] +; CHECK-NEXT: [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]] +; CHECK-NEXT: [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4 +; CHECK-NEXT: [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2 +; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] +; CHECK-NEXT: [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]] +; CHECK-NEXT: [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]] +; 
CHECK-NEXT: [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5 +; CHECK-NEXT: [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2 +; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32 +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] +; CHECK-NEXT: [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]] +; CHECK-NEXT: [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]] +; CHECK-NEXT: [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6 +; CHECK-NEXT: [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2 +; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] +; CHECK-NEXT: [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]] +; CHECK-NEXT: [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]] +; CHECK-NEXT: [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7 +; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2 +; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] +; CHECK-NEXT: [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]] +; CHECK-NEXT: [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]] ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 -; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; 
CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) -; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) -; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) -; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) -; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) -; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) -; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = 
extractelement <64 x i32> [[TMP16]], i32 6 -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11 -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 
[[ADD_2_2]], [[TMP37]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> 
[[TMP16]], i32 33 -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45 -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], 
[[TMP64]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 +; CHECK-NEXT: [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2 +; CHECK-NEXT: [[TMP74:%.*]] = zext i16 [[TMP56]] to i32 ; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57 +; CHECK-NEXT: [[MUL_766:%.*]] = mul nuw nsw i32 [[TMP74]], [[TMP74]] +; CHECK-NEXT: [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]] +; CHECK-NEXT: [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1 +; CHECK-NEXT: [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2 +; CHECK-NEXT: 
[[TMP75:%.*]] = zext i16 [[TMP57]] to i32 ; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 +; CHECK-NEXT: [[MUL_1_7:%.*]] = mul nuw nsw i32 [[TMP75]], [[TMP75]] +; CHECK-NEXT: [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]] +; CHECK-NEXT: [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2 +; CHECK-NEXT: [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2 +; CHECK-NEXT: [[TMP76:%.*]] = zext i16 [[TMP58]] to i32 ; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]] -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 +; CHECK-NEXT: [[MUL_2_7:%.*]] = mul nuw nsw i32 [[TMP76]], [[TMP76]] +; CHECK-NEXT: [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]] +; CHECK-NEXT: [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3 +; CHECK-NEXT: [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2 +; CHECK-NEXT: [[TMP77:%.*]] = zext i16 [[TMP59]] to i32 ; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 +; CHECK-NEXT: [[MUL_3_7:%.*]] = mul nuw nsw i32 [[TMP77]], [[TMP77]] +; CHECK-NEXT: [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]] +; CHECK-NEXT: [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4 +; CHECK-NEXT: [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2 +; CHECK-NEXT: [[TMP78:%.*]] = zext i16 [[TMP60]] to i32 ; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 +; CHECK-NEXT: [[MUL_4_7:%.*]] = mul nuw nsw i32 [[TMP78]], [[TMP78]] +; CHECK-NEXT: [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]] +; CHECK-NEXT: [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5 +; CHECK-NEXT: [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], 
align 2 +; CHECK-NEXT: [[TMP79:%.*]] = zext i16 [[TMP61]] to i32 ; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 +; CHECK-NEXT: [[MUL_5_7:%.*]] = mul nuw nsw i32 [[TMP79]], [[TMP79]] +; CHECK-NEXT: [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]] +; CHECK-NEXT: [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6 +; CHECK-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2 +; CHECK-NEXT: [[TMP80:%.*]] = zext i16 [[TMP62]] to i32 ; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 +; CHECK-NEXT: [[MUL_6_7:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP80]] +; CHECK-NEXT: [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]] +; CHECK-NEXT: [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7 +; CHECK-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2 +; CHECK-NEXT: [[TMP81:%.*]] = zext i16 [[TMP63]] to i32 ; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) +; CHECK-NEXT: [[MUL_7_7:%.*]] = mul nuw nsw i32 [[TMP81]], [[TMP81]] +; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]] ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 ; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 @@ -573,13 +801,101 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-NEXT: [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ] ; CHECK-NEXT: [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ] ; CHECK-NEXT: [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x 
i16>, ptr [[P_ADDR_035]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]]) -; CHECK-NEXT: [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]] -; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]] +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]] +; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] +; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]] +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]] +; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]] +; CHECK-NEXT: [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]] +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32 +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]] +; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]] +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32 +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]] +; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], 
[[ADD11_2]] +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 +; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32 +; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]] +; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]] +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5 +; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32 +; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]] +; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]] +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6 +; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 +; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]] +; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]] +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32 +; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]] +; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]] +; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2 +; CHECK-NEXT: [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]] +; CHECK-NEXT: [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]] 
+; CHECK-NEXT: [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]] +; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2 +; CHECK-NEXT: [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]] +; CHECK-NEXT: [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]] +; CHECK-NEXT: [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]] +; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2 +; CHECK-NEXT: [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]] +; CHECK-NEXT: [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]] +; CHECK-NEXT: [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]] +; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2 +; CHECK-NEXT: [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]] +; CHECK-NEXT: [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]] +; CHECK-NEXT: [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]] +; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2 +; CHECK-NEXT: [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]] +; CHECK-NEXT: [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]] +; CHECK-NEXT: [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]] +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2 +; CHECK-NEXT: [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[ADD_13:%.*]] = 
add i32 [[ADD_12]], [[CONV_13]] +; CHECK-NEXT: [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]] +; CHECK-NEXT: [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]] +; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2 +; CHECK-NEXT: [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]] +; CHECK-NEXT: [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]] +; CHECK-NEXT: [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]] +; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2 +; CHECK-NEXT: [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[OP_RDX1]] = add i32 [[ADD_14]], [[CONV_15]] +; CHECK-NEXT: [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]] +; CHECK-NEXT: [[OP_RDX]] = add i32 [[MUL_15]], [[ADD11_14]] ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[INC13]] = add nuw nsw i32 [[Y_038]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16 diff --git a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll similarity index 94% rename from llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll rename to llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll index 4478eab7b827a..15f4cffe77910 100644 --- a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if 
aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll index afaf6b98e5081..094d60b66b393 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll @@ -90,14 +90,14 @@ entry: define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) { ; CHECK-LABEL: @splat_loads_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds i64, ptr [[ARRAY2:%.*]], i64 1 -; CHECK-NEXT: [[LD_2_0:%.*]] = load i64, ptr [[ARRAY2]], align 8 -; CHECK-NEXT: [[LD_2_1:%.*]] = load i64, ptr [[GEP_2_1]], align 8 +; CHECK-NEXT: [[GEP_2_2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY3:%.*]], i64 1 +; CHECK-NEXT: [[LD_2_2:%.*]] = load i64, ptr [[ARRAY3]], align 8 +; CHECK-NEXT: [[LD_2_3:%.*]] = load i64, ptr [[GEP_2_2]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_2]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> 
[[TMP3]], [[TMP6]] @@ -131,14 +131,14 @@ entry: define void @splat_loads_i32(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) { ; CHECK-LABEL: @splat_loads_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_2_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY2:%.*]], i64 1 -; CHECK-NEXT: [[LD_2_0:%.*]] = load i32, ptr [[ARRAY2]], align 8 -; CHECK-NEXT: [[LD_2_1:%.*]] = load i32, ptr [[GEP_2_1]], align 8 +; CHECK-NEXT: [[GEP_2_2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY3:%.*]], i64 1 +; CHECK-NEXT: [[LD_2_2:%.*]] = load i32, ptr [[ARRAY3]], align 8 +; CHECK-NEXT: [[LD_2_3:%.*]] = load i32, ptr [[GEP_2_2]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_2]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = or <2 x i32> [[TMP0]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll index aeb82d800a2f7..3c2f9e4d0ab5d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll @@ -4,17 +4,17 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1 +; CHECK-NEXT: 
[[G_2197_REAL32_PRE:%.*]] = load i32, ptr null, align 1 +; CHECK-NEXT: [[G_2197_IMAG33_PRE:%.*]] = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1 ; CHECK-NEXT: br label %[[IF_END:.*]] ; CHECK: [[IF_THEN:.*]]: ; CHECK-NEXT: br label %[[IF_END]] ; CHECK: [[IF_END]]: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ] -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[G_2197_IMAG33_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[G_2197_REAL32_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ] ; CHECK-NEXT: store i32 [[TMP2]], ptr null, align 1 ; CHECK-NEXT: br label %[[TRAP:.*]] ; CHECK: [[BB3:.*:]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: store i32 [[TMP4]], ptr null, align 1 ; CHECK-NEXT: ret void ; CHECK: [[TRAP]]: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll index 3cb81b72d26a1..14ce08cb7aebe 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll @@ -6,19 +6,36 @@ define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) { ; CHECK-LABEL: define void @should_vectorize_gep ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64> -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr 
[[BASE_GEP]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1 -; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2 -; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3 -; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2 +; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64 +; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2 +; CHECK-NEXT: [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64 +; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]] +; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1 +; CHECK-NEXT: [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1 +; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2 +; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64 +; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2 +; CHECK-NEXT: [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64 +; CHECK-NEXT: [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]] +; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2 +; CHECK-NEXT: [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2 +; CHECK-NEXT: [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2 +; CHECK-NEXT: [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64 +; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2 +; CHECK-NEXT: [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64 +; CHECK-NEXT: [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]] +; CHECK-NEXT: 
[[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]] +; CHECK-NEXT: [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3 +; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3 +; CHECK-NEXT: [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2 +; CHECK-NEXT: [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64 +; CHECK-NEXT: [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2 +; CHECK-NEXT: [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64 +; CHECK-NEXT: [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]] +; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]] ; CHECK-NEXT: call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index 00bd3eb232981..15518c9c57140 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -249,7 +249,7 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll new 
file mode 100644 index 0000000000000..f90456297d7cb --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> , i32 [[PHI7]], i32 0 +; CHECK-NEXT: switch i32 0, label [[BB16:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB14:%.*]] +; CHECK-NEXT: i32 1, label [[BB11:%.*]] +; CHECK-NEXT: ] +; CHECK: bb9: +; CHECK-NEXT: br label [[BB11]] +; CHECK: bb10: +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb11: +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ poison, [[BB9:%.*]] ], [ [[TMP1]], [[BB1]] ] +; CHECK-NEXT: ret void +; CHECK: bb14: +; CHECK-NEXT: ret void +; CHECK: bb15: +; CHECK-NEXT: ret void +; CHECK: bb16: +; CHECK-NEXT: [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ], [ poison, [[BB25:%.*]] ] +; CHECK-NEXT: ret void +; CHECK: bb25: +; CHECK-NEXT: switch i32 0, label [[BB16]] [ +; CHECK-NEXT: i32 0, label [[BB14]] +; CHECK-NEXT: i32 1, label [[BB15:%.*]] +; CHECK-NEXT: ] +; +bb: + br label %bb1 + +bb1: + %phi = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi2 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi3 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi4 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi5 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi6 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi7 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + %phi8 = phi i32 [ 0, %bb10 ], [ 0, %bb ] + switch i32 0, label %bb16 [ + i32 0, label %bb14 + i32 1, label %bb11 + ] + +bb9: + br label %bb11 + +bb10: + br label %bb1 + 
+bb11: + %phi12 = phi i32 [ 0, %bb9 ], [ %phi7, %bb1 ] + %phi13 = phi i32 [ 0, %bb9 ], [ undef, %bb1 ] + ret void + +bb14: + ret void + +bb15: + ret void + +bb16: + %phi17 = phi i32 [ %phi, %bb1 ], [ 0, %bb25 ] + %phi18 = phi i32 [ %phi2, %bb1 ], [ 0, %bb25 ] + %phi19 = phi i32 [ %phi3, %bb1 ], [ 0, %bb25 ] + %phi20 = phi i32 [ %phi4, %bb1 ], [ 0, %bb25 ] + %phi21 = phi i32 [ %phi5, %bb1 ], [ 0, %bb25 ] + %phi22 = phi i32 [ %phi6, %bb1 ], [ 0, %bb25 ] + %phi23 = phi i32 [ %phi7, %bb1 ], [ 0, %bb25 ] + %phi24 = phi i32 [ %phi8, %bb1 ], [ 0, %bb25 ] + ret void + +bb25: + switch i32 0, label %bb16 [ + i32 0, label %bb14 + i32 1, label %bb15 + ] +} diff --git a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll similarity index 91% rename from llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll rename to llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll index 002b9a70255da..278e55c67f23f 100644 --- a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} -; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s define i32 @test(ptr %sptr, i64 %0) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll new file mode 100644 index 0000000000000..0dac02b0bcc09 --- /dev/null +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], splat (i32 16) +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16> +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16) +; CHECK-NEXT: [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], splat (i32 16) +; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16> +; CHECK-NEXT: br i1 true, label [[BB3]], label [[BB2]] +; CHECK: bb3: +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i16> [ 
[[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: store i32 [[TMP18]], ptr [[B]], align 16 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = sext i16 [[TMP19]] to i32 +; CHECK-NEXT: store i32 [[TMP20]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: store i32 [[TMP22]], ptr [[C]], align 16 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = sext i16 [[TMP23]] to i32 +; CHECK-NEXT: store i32 [[TMP24]], ptr [[B]], align 8 +; CHECK-NEXT: ret i32 0 +; +entry: + br i1 %tobool3.not, label %bb1, label %bb2 + +bb1: + %conv1.i.us = ashr i32 %0, 16 + %cmp2.i.us = icmp slt i32 %conv1.i.us, %0 + %sext26.us = zext i1 %cmp2.i.us to i32 + %conv1.i.us.5 = ashr i32 %0, 16 + %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0 + %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32 + %conv1.i.us.6 = ashr i32 %0, 16 + %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0 + %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32 + %conv1.i.us.7 = ashr i32 %0, 16 + %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0 + %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32 + br label %bb3 + +bb2: + %cmp2.i = icmp sgt i32 %0, 0 + %1 = zext i1 %cmp2.i to i32 + %cond.i = select i1 %tobool3.not, i32 %0, i32 %1 + %sext26 = shl i32 %cond.i, 16 + %conv13 = ashr i32 %sext26, 16 + %cmp2.i.5 = icmp sgt i32 %0, 0 + %2 = zext i1 %cmp2.i.5 to i32 + %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2 + %sext26.5 = shl i32 %cond.i.5, 16 + %conv13.5 = ashr i32 %sext26.5, 16 + %cmp2.i.6 = icmp sgt i32 %0, 0 + %3 = zext i1 %cmp2.i.6 to i32 + %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3 + %sext26.6 = shl i32 %cond.i.6, 16 + %conv13.6 = ashr i32 %sext26.6, 16 + %cmp2.i.7 = icmp sgt i32 %0, 0 + %4 = 
zext i1 %cmp2.i.7 to i32 + %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4 + %sext26.7 = shl i32 %cond.i.7, 16 + %conv13.7 = ashr i32 %sext26.7, 16 + br i1 true, label %bb3, label %bb2 + +bb3: + %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ] + %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ] + %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ] + %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ] + store i32 %conv13p, ptr %b, align 16 + store i32 %conv13.5p, ptr %a, align 8 + store i32 %conv13.6p, ptr %c, align 16 + store i32 %conv13.7p, ptr %b, align 8 + ret i32 0 +} diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll index 9b6511d0d8284..d880c6b1783c8 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll @@ -17,12 +17,12 @@ define <2 x i32> @test(i32 %arg) { ; AARCH64-LABEL: define <2 x i32> @test( ; AARCH64-SAME: i32 [[ARG:%.*]]) { ; AARCH64-NEXT: bb: -; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 -; AARCH64-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer -; AARCH64-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; AARCH64-NEXT: [[TMP2:%.*]] = or i32 [[ARG]], 0 +; AARCH64-NEXT: [[TMP3:%.*]] = mul i32 0, 1 ; AARCH64-NEXT: [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]] ; AARCH64-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; AARCH64-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP3]], i32 1 ; AARCH64-NEXT: ret <2 x i32> [[TMP1]] ; bb: diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll 
b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll index e6e5f5196d3da..5c035d29a7ea2 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll @@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) { ; Scalarizing the load for multiple constant indices may not be profitable. define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx( -; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1 +; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <4 x i32>, ptr %x @@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; because the vector large vector requires 2 vector registers. 
define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable( -; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6 +; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <8 x i32>, ptr %x, align 16